summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c754
-rw-r--r--kernel/cgroup_freezer.c514
-rw-r--r--kernel/cpuset.c90
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/fork.c9
-rw-r--r--kernel/freezer.c11
-rw-r--r--kernel/power/process.c13
-rw-r--r--kernel/sched/core.c16
-rw-r--r--kernel/signal.c20
9 files changed, 758 insertions, 677 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724..f34c41b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,6 +138,9 @@ struct cgroupfs_root {
/* Hierarchy-specific flags */
unsigned long flags;
+ /* IDs for cgroups in this hierarchy */
+ struct ida cgroup_ida;
+
/* The path to use for release notifications. */
char release_agent_path[PATH_MAX];
@@ -171,8 +174,8 @@ struct css_id {
* The css to which this ID points. This pointer is set to valid value
* after cgroup is populated. If cgroup is removed, this will be NULL.
* This pointer is expected to be RCU-safe because destroy()
- * is called after synchronize_rcu(). But for safe use, css_is_removed()
- * css_tryget() should be used for avoiding race.
+ * is called after synchronize_rcu(). But for safe use, css_tryget()
+ * should be used for avoiding race.
*/
struct cgroup_subsys_state __rcu *css;
/*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
*/
static int need_forkexit_callback __read_mostly;
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+ struct cftype cfts[], bool is_add);
+
#ifdef CONFIG_PROVE_LOCKING
int cgroup_lock_is_held(void)
{
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
-static int clone_children(const struct cgroup *cgrp)
-{
- return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-}
-
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* The task_lock() exception
*
* The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one task's cgroup pointer with
* another. It does so using cgroup_mutex, however there are
* several performance critical places that need to reference
* task->cgroup without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroup pointer we use
* task_lock(), which acts on a spinlock (task->alloc_lock) already in
* the task_struct routinely used for such matters.
*
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
return inode;
}
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- int ret = 0;
-
- for_each_subsys(cgrp->root, ss) {
- if (!ss->pre_destroy)
- continue;
-
- ret = ss->pre_destroy(cgrp);
- if (ret) {
- /* ->pre_destroy() failure is being deprecated */
- WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
- break;
- }
- }
-
- return ret;
-}
-
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
* Release the subsystem state objects.
*/
for_each_subsys(cgrp->root, ss)
- ss->destroy(cgrp);
+ ss->css_free(cgrp);
cgrp->root->number_of_cgroups--;
mutex_unlock(&cgroup_mutex);
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
simple_xattrs_free(&cgrp->xattrs);
+ ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
kfree_rcu(cgrp, rcu_head);
} else {
struct cfent *cfe = __d_cfe(dentry);
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
if (!test_bit(ss->subsys_id, &subsys_mask))
continue;
list_for_each_entry(set, &ss->cftsets, node)
- cgroup_rm_file(cgrp, set->cfts);
+ cgroup_addrm_files(cgrp, NULL, set->cfts, false);
}
if (base_files) {
while (!list_empty(&cgrp->files))
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
}
/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
-
-/*
* Call with cgroup_mutex held. Drops reference counts on modules, including
* any duplicate ones that parse_cgroupfs_options took. If this function
* returns an error, no reference counts are touched.
@@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",xattr");
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
- if (clone_children(&root->top_cgroup))
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
@@ -1162,7 +1114,7 @@ struct cgroup_sb_opts {
unsigned long subsys_mask;
unsigned long flags;
char *release_agent;
- bool clone_children;
+ bool cpuset_clone_children;
char *name;
/* User explicitly requested empty subsystem */
bool none;
@@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
continue;
}
if (!strcmp(token, "clone_children")) {
- opts->clone_children = true;
+ opts->cpuset_clone_children = true;
continue;
}
if (!strcmp(token, "xattr")) {
@@ -1397,14 +1349,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
goto out_unlock;
}
+ /*
+ * Clear out the files of subsystems that should be removed, do
+ * this before rebind_subsystems, since rebind_subsystems may
+ * change this hierarchy's subsys_list.
+ */
+ cgroup_clear_directory(cgrp->dentry, false, removed_mask);
+
ret = rebind_subsystems(root, opts.subsys_mask);
if (ret) {
+ /* rebind_subsystems failed, re-populate the removed files */
+ cgroup_populate_dir(cgrp, false, removed_mask);
drop_parsed_module_refcounts(opts.subsys_mask);
goto out_unlock;
}
- /* clear out any existing files and repopulate subsystem files */
- cgroup_clear_directory(cgrp->dentry, false, removed_mask);
/* re-populate subsystem files */
cgroup_populate_dir(cgrp, false, added_mask);
@@ -1432,6 +1391,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->files);
INIT_LIST_HEAD(&cgrp->css_sets);
+ INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
@@ -1450,8 +1410,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
root->number_of_cgroups = 1;
cgrp->root = root;
cgrp->top_cgroup = cgrp;
- list_add_tail(&cgrp->allcg_node, &root->allcg_list);
init_cgroup_housekeeping(cgrp);
+ list_add_tail(&cgrp->allcg_node, &root->allcg_list);
}
static bool init_root_id(struct cgroupfs_root *root)
@@ -1518,12 +1478,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
root->subsys_mask = opts->subsys_mask;
root->flags = opts->flags;
+ ida_init(&root->cgroup_ida);
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
- if (opts->clone_children)
- set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
+ if (opts->cpuset_clone_children)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
return root;
}
@@ -1536,6 +1497,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
spin_lock(&hierarchy_id_lock);
ida_remove(&hierarchy_ida, root->hierarchy_id);
spin_unlock(&hierarchy_id_lock);
+ ida_destroy(&root->cgroup_ida);
kfree(root);
}
@@ -1701,7 +1663,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
free_cg_links(&tmp_cg_links);
- BUG_ON(!list_empty(&root_cgrp->sibling));
BUG_ON(!list_empty(&root_cgrp->children));
BUG_ON(root->number_of_cgroups != 1);
@@ -1750,7 +1711,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
BUG_ON(root->number_of_cgroups != 1);
BUG_ON(!list_empty(&cgrp->children));
- BUG_ON(!list_empty(&cgrp->sibling));
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
@@ -1808,9 +1768,11 @@ static struct kobject *cgroup_kobj;
*/
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
+ struct dentry *dentry = cgrp->dentry;
char *start;
- struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
- cgroup_lock_is_held());
+
+ rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
+ "cgroup_path() called without proper locking");
if (!dentry || cgrp == dummytop) {
/*
@@ -1821,9 +1783,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
return 0;
}
- start = buf + buflen;
+ start = buf + buflen - 1;
- *--start = '\0';
+ *start = '\0';
for (;;) {
int len = dentry->d_name.len;
@@ -1834,8 +1796,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
if (!cgrp)
break;
- dentry = rcu_dereference_check(cgrp->dentry,
- cgroup_lock_is_held());
+ dentry = cgrp->dentry;
if (!cgrp->parent)
continue;
if (--start < buf)
@@ -1930,9 +1891,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
/*
* cgroup_task_migrate - move a task from one cgroup to another.
*
- * 'guarantee' is set if the caller promises that a new css_set for the task
- * will already exist. If not set, this function might sleep, and can fail with
- * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
+ * Must be called with cgroup_mutex and threadgroup locked.
*/
static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
struct task_struct *tsk, struct css_set *newcg)
@@ -2025,12 +1984,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
synchronize_rcu();
-
- /*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
- */
- cgroup_wakeup_rmdir_waiter(cgrp);
out:
if (retval) {
for_each_subsys(root, ss) {
@@ -2200,7 +2153,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
* step 5: success! and cleanup
*/
synchronize_rcu();
- cgroup_wakeup_rmdir_waiter(cgrp);
retval = 0;
out_put_css_set_refs:
if (retval) {
@@ -2711,10 +2663,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
/* start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
+ inc_nlink(dentry->d_parent->d_inode);
- /* start with the directory inode held, so that we can
- * populate it without racing with another mkdir */
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ /*
+ * Control reaches here with cgroup_mutex held.
+ * @inode->i_mutex should nest outside cgroup_mutex but we
+ * want to populate it immediately without releasing
+ * cgroup_mutex. As @inode isn't visible to anyone else
+ * yet, trylock will always succeed without affecting
+ * lockdep checks.
+ */
+ WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
} else if (S_ISREG(mode)) {
inode->i_size = 0;
inode->i_fop = &cgroup_file_operations;
@@ -2725,32 +2684,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
return 0;
}
-/*
- * cgroup_create_dir - create a directory for an object.
- * @cgrp: the cgroup we create the directory for. It must have a valid
- * ->parent field. And we are going to fill its ->dentry field.
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new directory.
- */
-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
- umode_t mode)
-{
- struct dentry *parent;
- int error = 0;
-
- parent = cgrp->parent->dentry;
- error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
- if (!error) {
- dentry->d_fsdata = cgrp;
- inc_nlink(parent->d_inode);
- rcu_assign_pointer(cgrp->dentry, dentry);
- dget(dentry);
- }
- dput(dentry);
-
- return error;
-}
-
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
@@ -2791,12 +2724,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
simple_xattrs_init(&cft->xattrs);
- /* does @cft->flags tell us to skip creation on @cgrp? */
- if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
- return 0;
- if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
- return 0;
-
if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
strcpy(name, subsys->name);
strcat(name, ".");
@@ -2837,6 +2764,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
int err, ret = 0;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* does cft->flags tell us to skip this file on @cgrp? */
+ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+ continue;
+ if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+ continue;
+
if (is_add)
err = cgroup_add_file(cgrp, subsys, cft);
else
@@ -3044,6 +2977,92 @@ static void cgroup_enable_task_cg_lists(void)
write_unlock(&css_set_lock);
}
+/**
+ * cgroup_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_pre(). Find the next
+ * descendant to visit for pre-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
+ struct cgroup *cgroup)
+{
+ struct cgroup *next;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* if first iteration, pretend we just visited @cgroup */
+ if (!pos) {
+ if (list_empty(&cgroup->children))
+ return NULL;
+ pos = cgroup;
+ }
+
+ /* visit the first child if exists */
+ next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
+ if (next)
+ return next;
+
+ /* no child, visit my or the closest ancestor's next sibling */
+ do {
+ next = list_entry_rcu(pos->sibling.next, struct cgroup,
+ sibling);
+ if (&next->sibling != &pos->parent->children)
+ return next;
+
+ pos = pos->parent;
+ } while (pos != cgroup);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+
+static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
+{
+ struct cgroup *last;
+
+ do {
+ last = pos;
+ pos = list_first_or_null_rcu(&pos->children, struct cgroup,
+ sibling);
+ } while (pos);
+
+ return last;
+}
+
+/**
+ * cgroup_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_post(). Find the next
+ * descendant to visit for post-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
+ struct cgroup *cgroup)
+{
+ struct cgroup *next;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* if first iteration, visit the leftmost descendant */
+ if (!pos) {
+ next = cgroup_leftmost_descendant(cgroup);
+ return next != cgroup ? next : NULL;
+ }
+
+ /* if there's an unvisited sibling, visit its leftmost descendant */
+ next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+ if (&next->sibling != &pos->parent->children)
+ return cgroup_leftmost_descendant(next);
+
+ /* no sibling left, visit parent */
+ next = pos->parent;
+ return next != cgroup ? next : NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
+
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
__acquires(css_set_lock)
{
@@ -3757,7 +3776,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
if (flags & POLLHUP) {
__remove_wait_queue(event->wqh, &event->wait);
spin_lock(&cgrp->event_list_lock);
- list_del(&event->list);
+ list_del_init(&event->list);
spin_unlock(&cgrp->event_list_lock);
/*
* We are in atomic context, but cgroup_event_remove() may
@@ -3894,7 +3913,7 @@ fail:
static u64 cgroup_clone_children_read(struct cgroup *cgrp,
struct cftype *cft)
{
- return clone_children(cgrp);
+ return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
}
static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3902,9 +3921,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
u64 val)
{
if (val)
- set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
else
- clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
return 0;
}
@@ -4017,19 +4036,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
css->flags = 0;
css->id = NULL;
if (cgrp == dummytop)
- set_bit(CSS_ROOT, &css->flags);
+ css->flags |= CSS_ROOT;
BUG_ON(cgrp->subsys[ss->subsys_id]);
cgrp->subsys[ss->subsys_id] = css;
/*
- * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
- * which is put on the last css_put(). dput() requires process
- * context, which css_put() may be called without. @css->dput_work
- * will be used to invoke dput() asynchronously from css_put().
+ * css holds an extra ref to @cgrp->dentry which is put on the last
+ * css_put(). dput() requires process context, which css_put() may
+ * be called without. @css->dput_work will be used to invoke
+ * dput() asynchronously from css_put().
*/
INIT_WORK(&css->dput_work, css_dput_fn);
- if (ss->__DEPRECATED_clear_css_refs)
- set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
+}
+
+/* invoke ->post_create() on a new CSS and mark it online if successful */
+static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ int ret = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (ss->css_online)
+ ret = ss->css_online(cgrp);
+ if (!ret)
+ cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
+ return ret;
+}
+
+/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */
+static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+ struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!(css->flags & CSS_ONLINE))
+ return;
+
+ /*
+ * css_offline() should be called with cgroup_mutex unlocked. See
+ * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
+ * details. This temporary unlocking should go away once
+ * cgroup_mutex is unexported from controllers.
+ */
+ if (ss->css_offline) {
+ mutex_unlock(&cgroup_mutex);
+ ss->css_offline(cgrp);
+ mutex_lock(&cgroup_mutex);
+ }
+
+ cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
}
/*
@@ -4049,10 +4106,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
+ /* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
if (!cgrp)
return -ENOMEM;
+ cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
+ if (cgrp->id < 0)
+ goto err_free_cgrp;
+
+ /*
+ * Only live parents can have children. Note that the liveliness
+ * check isn't strictly necessary because cgroup_mkdir() and
+ * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
+ * anyway so that locking is contained inside cgroup proper and we
+ * don't get nasty surprises if we ever grow another caller.
+ */
+ if (!cgroup_lock_live_group(parent)) {
+ err = -ENODEV;
+ goto err_free_id;
+ }
+
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
@@ -4060,8 +4134,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
* fs */
atomic_inc(&sb->s_active);
- mutex_lock(&cgroup_mutex);
-
init_cgroup_housekeeping(cgrp);
cgrp->parent = parent;
@@ -4071,26 +4143,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
- if (clone_children(parent))
- set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
for_each_subsys(root, ss) {
struct cgroup_subsys_state *css;
- css = ss->create(cgrp);
+ css = ss->css_alloc(cgrp);
if (IS_ERR(css)) {
err = PTR_ERR(css);
- goto err_destroy;
+ goto err_free_all;
}
init_cgroup_css(css, ss, cgrp);
if (ss->use_id) {
err = alloc_css_id(ss, parent, cgrp);
if (err)
- goto err_destroy;
+ goto err_free_all;
}
- /* At error, ->destroy() callback has to free assigned ID. */
- if (clone_children(parent) && ss->post_clone)
- ss->post_clone(cgrp);
+ }
+
+ /*
+ * Create directory. cgroup_create_file() returns with the new
+ * directory locked on success so that it can be populated without
+ * dropping cgroup_mutex.
+ */
+ err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
+ if (err < 0)
+ goto err_free_all;
+ lockdep_assert_held(&dentry->d_inode->i_mutex);
+
+ /* allocation complete, commit to creation */
+ dentry->d_fsdata = cgrp;
+ cgrp->dentry = dentry;
+ list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
+ root->number_of_cgroups++;
+
+ /* each css holds a ref to the cgroup's dentry */
+ for_each_subsys(root, ss)
+ dget(dentry);
+
+ /* creation succeeded, notify subsystems */
+ for_each_subsys(root, ss) {
+ err = online_css(ss, cgrp);
+ if (err)
+ goto err_destroy;
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
@@ -4102,50 +4199,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
}
}
- list_add(&cgrp->sibling, &cgrp->parent->children);
- root->number_of_cgroups++;
-
- err = cgroup_create_dir(cgrp, dentry, mode);
- if (err < 0)
- goto err_remove;
-
- /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
- for_each_subsys(root, ss)
- if (!ss->__DEPRECATED_clear_css_refs)
- dget(dentry);
-
- /* The cgroup directory was pre-locked for us */
- BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
-
- list_add_tail(&cgrp->allcg_node, &root->allcg_list);
-
err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
- /* If err < 0, we have a half-filled directory - oh well ;) */
+ if (err)
+ goto err_destroy;
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return 0;
- err_remove:
-
- list_del(&cgrp->sibling);
- root->number_of_cgroups--;
-
- err_destroy:
-
+err_free_all:
for_each_subsys(root, ss) {
if (cgrp->subsys[ss->subsys_id])
- ss->destroy(cgrp);
+ ss->css_free(cgrp);
}
-
mutex_unlock(&cgroup_mutex);
-
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
-
+err_free_id:
+ ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_cgrp:
kfree(cgrp);
return err;
+
+err_destroy:
+ cgroup_destroy_locked(cgrp);
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&dentry->d_inode->i_mutex);
+ return err;
}
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -4197,153 +4278,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
return 0;
}
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- *
- * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
- * not, cgroup removal behaves differently.
- *
- * If clear is set, css refcnt for the subsystem should be zero before
- * cgroup removal can be committed. This is implemented by
- * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
- * called multiple times until all css refcnts reach zero and is allowed to
- * veto removal on any invocation. This behavior is deprecated and will be
- * removed as soon as the existing user (memcg) is updated.
- *
- * If clear is not set, each css holds an extra reference to the cgroup's
- * dentry and cgroup removal proceeds regardless of css refs.
- * ->pre_destroy() will be called at least once and is not allowed to fail.
- * On the last put of each css, whenever that may be, the extra dentry ref
- * is put so that dentry destruction happens only after all css's are
- * released.
- */
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
+ struct dentry *d = cgrp->dentry;
+ struct cgroup *parent = cgrp->parent;
+ DEFINE_WAIT(wait);
+ struct cgroup_event *event, *tmp;
struct cgroup_subsys *ss;
- unsigned long flags;
- bool failed = false;
+ LIST_HEAD(tmp_list);
+
+ lockdep_assert_held(&d->d_inode->i_mutex);
+ lockdep_assert_held(&cgroup_mutex);
- local_irq_save(flags);
+ if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
+ return -EBUSY;
/*
- * Block new css_tryget() by deactivating refcnt. If all refcnts
- * for subsystems w/ clear_css_refs set were 1 at the moment of
- * deactivation, we succeeded.
+ * Block new css_tryget() by deactivating refcnt and mark @cgrp
+ * removed. This makes future css_tryget() and child creation
+ * attempts fail thus maintaining the removal conditions verified
+ * above.
*/
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
WARN_ON(atomic_read(&css->refcnt) < 0);
atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-
- if (ss->__DEPRECATED_clear_css_refs)
- failed |= css_refcnt(css) != 1;
- }
-
- /*
- * If succeeded, set REMOVED and put all the base refs; otherwise,
- * restore refcnts to positive values. Either way, all in-progress
- * css_tryget() will be released.
- */
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
- if (!failed) {
- set_bit(CSS_REMOVED, &css->flags);
- css_put(css);
- } else {
- atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
- }
}
+ set_bit(CGRP_REMOVED, &cgrp->flags);
- local_irq_restore(flags);
- return !failed;
-}
-
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
-{
- struct cgroup *cgrp = dentry->d_fsdata;
- struct dentry *d;
- struct cgroup *parent;
- DEFINE_WAIT(wait);
- struct cgroup_event *event, *tmp;
- int ret;
-
- /* the vfs holds both inode->i_mutex already */
-again:
- mutex_lock(&cgroup_mutex);
- if (atomic_read(&cgrp->count) != 0) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- if (!list_empty(&cgrp->children)) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- mutex_unlock(&cgroup_mutex);
-
- /*
- * In general, subsystem has no css->refcnt after pre_destroy(). But
- * in racy cases, subsystem may have to get css->refcnt after
- * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
- * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
- * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
- * and subsystem's reference count handling. Please see css_get/put
- * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
- */
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ /* tell subsystems to initate destruction */
+ for_each_subsys(cgrp->root, ss)
+ offline_css(ss, cgrp);
/*
- * Call pre_destroy handlers of subsys. Notify subsystems
- * that rmdir() request comes.
+ * Put all the base refs. Each css holds an extra reference to the
+ * cgroup's dentry and cgroup removal proceeds regardless of css
+ * refs. On the last put of each css, whenever that may be, the
+ * extra dentry ref is put so that dentry destruction happens only
+ * after all css's are released.
*/
- ret = cgroup_call_pre_destroy(cgrp);
- if (ret) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- return ret;
- }
-
- mutex_lock(&cgroup_mutex);
- parent = cgrp->parent;
- if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
- if (!cgroup_clear_css_refs(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- /*
- * Because someone may call cgroup_wakeup_rmdir_waiter() before
- * prepare_to_wait(), we need to check this flag.
- */
- if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
- schedule();
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- if (signal_pending(current))
- return -EINTR;
- goto again;
- }
- /* NO css_tryget() can success after here. */
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ for_each_subsys(cgrp->root, ss)
+ css_put(cgrp->subsys[ss->subsys_id]);
raw_spin_lock(&release_list_lock);
- set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
/* delete this cgroup from parent->children */
- list_del_init(&cgrp->sibling);
-
+ list_del_rcu(&cgrp->sibling);
list_del_init(&cgrp->allcg_node);
- d = dget(cgrp->dentry);
-
+ dget(d);
cgroup_d_remove_dir(d);
dput(d);
@@ -4353,21 +4341,35 @@ again:
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
- * directory to avoid race between userspace and kernelspace
+ * directory to avoid race between userspace and kernelspace. Use
+ * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
+ * cgroup_event_wake() is called with the wait queue head locked,
+ * remove_wait_queue() cannot be called while holding event_list_lock.
*/
spin_lock(&cgrp->event_list_lock);
- list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
- list_del(&event->list);
+ list_splice_init(&cgrp->event_list, &tmp_list);
+ spin_unlock(&cgrp->event_list_lock);
+ list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+ list_del_init(&event->list);
remove_wait_queue(event->wqh, &event->wait);
eventfd_signal(event->eventfd, 1);
schedule_work(&event->remove);
}
- spin_unlock(&cgrp->event_list_lock);
- mutex_unlock(&cgroup_mutex);
return 0;
}
+static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = cgroup_destroy_locked(dentry->d_fsdata);
+ mutex_unlock(&cgroup_mutex);
+
+ return ret;
+}
+
static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
{
INIT_LIST_HEAD(&ss->cftsets);
@@ -4388,13 +4390,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+ mutex_lock(&cgroup_mutex);
+
/* init base cftset */
cgroup_init_cftsets(ss);
/* Create the top cgroup state for this subsystem */
list_add(&ss->sibling, &rootnode.subsys_list);
ss->root = &rootnode;
- css = ss->create(dummytop);
+ css = ss->css_alloc(dummytop);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_cgroup_css(css, ss, dummytop);
@@ -4403,7 +4407,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
* init_css_set is in the subsystem's top cgroup. */
- init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+ init_css_set.subsys[ss->subsys_id] = css;
need_forkexit_callback |= ss->fork || ss->exit;
@@ -4413,6 +4417,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
BUG_ON(!list_empty(&init_task.tasks));
ss->active = 1;
+ BUG_ON(online_css(ss, dummytop));
+
+ mutex_unlock(&cgroup_mutex);
/* this function shouldn't be used with modular subsystems, since they
* need to register a subsys_id, among other things */
@@ -4430,12 +4437,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
*/
int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
- int i;
struct cgroup_subsys_state *css;
+ int i, ret;
/* check name and function validity */
if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
- ss->create == NULL || ss->destroy == NULL)
+ ss->css_alloc == NULL || ss->css_free == NULL)
return -EINVAL;
/*
@@ -4464,10 +4471,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
subsys[ss->subsys_id] = ss;
/*
- * no ss->create seems to need anything important in the ss struct, so
- * this can happen first (i.e. before the rootnode attachment).
+ * no ss->css_alloc seems to need anything important in the ss
+ * struct, so this can happen first (i.e. before the rootnode
+ * attachment).
*/
- css = ss->create(dummytop);
+ css = ss->css_alloc(dummytop);
if (IS_ERR(css)) {
/* failure case - need to deassign the subsys[] slot. */
subsys[ss->subsys_id] = NULL;
@@ -4482,14 +4490,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
init_cgroup_css(css, ss, dummytop);
/* init_idr must be after init_cgroup_css because it sets css->id. */
if (ss->use_id) {
- int ret = cgroup_init_idr(ss, css);
- if (ret) {
- dummytop->subsys[ss->subsys_id] = NULL;
- ss->destroy(dummytop);
- subsys[ss->subsys_id] = NULL;
- mutex_unlock(&cgroup_mutex);
- return ret;
- }
+ ret = cgroup_init_idr(ss, css);
+ if (ret)
+ goto err_unload;
}
/*
@@ -4522,10 +4525,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
write_unlock(&css_set_lock);
ss->active = 1;
+ ret = online_css(ss, dummytop);
+ if (ret)
+ goto err_unload;
/* success! */
mutex_unlock(&cgroup_mutex);
return 0;
+
+err_unload:
+ mutex_unlock(&cgroup_mutex);
+ /* @ss can't be mounted here as try_module_get() would fail */
+ cgroup_unload_subsys(ss);
+ return ret;
}
EXPORT_SYMBOL_GPL(cgroup_load_subsys);
@@ -4552,6 +4564,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
BUG_ON(ss->root != &rootnode);
mutex_lock(&cgroup_mutex);
+
+ offline_css(ss, dummytop);
+ ss->active = 0;
+
+ if (ss->use_id) {
+ idr_remove_all(&ss->idr);
+ idr_destroy(&ss->idr);
+ }
+
/* deassign the subsys_id */
subsys[ss->subsys_id] = NULL;
@@ -4567,7 +4588,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
struct css_set *cg = link->cg;
hlist_del(&cg->hlist);
- BUG_ON(!cg->subsys[ss->subsys_id]);
cg->subsys[ss->subsys_id] = NULL;
hhead = css_set_hash(cg->subsys);
hlist_add_head(&cg->hlist, hhead);
@@ -4575,12 +4595,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
write_unlock(&css_set_lock);
/*
- * remove subsystem's css from the dummytop and free it - need to free
- * before marking as null because ss->destroy needs the cgrp->subsys
- * pointer to find their state. note that this also takes care of
- * freeing the css_id.
+ * remove subsystem's css from the dummytop and free it - need to
+ * free before marking as null because ss->css_free needs the
+ * cgrp->subsys pointer to find their state. note that this also
+ * takes care of freeing the css_id.
*/
- ss->destroy(dummytop);
+ ss->css_free(dummytop);
dummytop->subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
@@ -4624,8 +4644,8 @@ int __init cgroup_init_early(void)
BUG_ON(!ss->name);
BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
- BUG_ON(!ss->create);
- BUG_ON(!ss->destroy);
+ BUG_ON(!ss->css_alloc);
+ BUG_ON(!ss->css_free);
if (ss->subsys_id != i) {
printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
ss->name, ss->subsys_id);
@@ -4832,44 +4852,19 @@ void cgroup_fork(struct task_struct *child)
}
/**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
- if (need_forkexit_callback) {
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
-
- /*
- * forkexit callbacks are only supported for
- * builtin subsystems.
- */
- if (!ss || ss->module)
- continue;
-
- if (ss->fork)
- ss->fork(child);
- }
- }
-}
-
-/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
- * Adds the task to the list running through its css_set if necessary.
- * Has to be after the task is visible on the task list in case we race
- * with the first call to cgroup_iter_start() - to guarantee that the
- * new task ends up on its list.
+ * Adds the task to the list running through its css_set if necessary and
+ * call the subsystem fork() callbacks. Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_iter_start() - to guarantee that the new task ends up on its
+ * list.
*/
void cgroup_post_fork(struct task_struct *child)
{
+ int i;
+
/*
* use_task_css_set_links is set to 1 before we walk the tasklist
* under the tasklist_lock and we read it here after we added the child
@@ -4889,7 +4884,30 @@ void cgroup_post_fork(struct task_struct *child)
task_unlock(child);
write_unlock(&css_set_lock);
}
+
+ /*
+ * Call ss->fork(). This must happen after @child is linked on
+ * css_set; otherwise, @child might change state between ->fork()
+ * and addition to css_set.
+ */
+ if (need_forkexit_callback) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+
+ /*
+ * fork/exit callbacks are supported only for
+ * builtin subsystems and we don't need further
+ * synchronization as they never go away.
+ */
+ if (!ss || ss->module)
+ continue;
+
+ if (ss->fork)
+ ss->fork(child);
+ }
+ }
}
+
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
@@ -5022,15 +5040,17 @@ static void check_for_release(struct cgroup *cgrp)
/* Caller must verify that the css is not for root cgroup */
bool __css_tryget(struct cgroup_subsys_state *css)
{
- do {
- int v = css_refcnt(css);
+ while (true) {
+ int t, v;
- if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+ v = css_refcnt(css);
+ t = atomic_cmpxchg(&css->refcnt, v, v + 1);
+ if (likely(t == v))
return true;
+ else if (t < 0)
+ return false;
cpu_relax();
- } while (!test_bit(CSS_REMOVED, &css->flags));
-
- return false;
+ }
}
EXPORT_SYMBOL_GPL(__css_tryget);
@@ -5049,11 +5069,9 @@ void __css_put(struct cgroup_subsys_state *css)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
- cgroup_wakeup_rmdir_waiter(cgrp);
break;
case 0:
- if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
- schedule_work(&css->dput_work);
+ schedule_work(&css->dput_work);
break;
}
rcu_read_unlock();
@@ -5439,7 +5457,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
}
#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
+static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
{
struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
@@ -5449,7 +5467,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
return css;
}
-static void debug_destroy(struct cgroup *cont)
+static void debug_css_free(struct cgroup *cont)
{
kfree(cont->subsys[debug_subsys_id]);
}
@@ -5578,8 +5596,8 @@ static struct cftype debug_files[] = {
struct cgroup_subsys debug_subsys = {
.name = "debug",
- .create = debug_create,
- .destroy = debug_destroy,
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
.subsys_id = debug_subsys_id,
.base_cftypes = debug_files,
};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index b1724ce..75dda1e 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -22,24 +22,33 @@
#include <linux/freezer.h>
#include <linux/seq_file.h>
-enum freezer_state {
- CGROUP_THAWED = 0,
- CGROUP_FREEZING,
- CGROUP_FROZEN,
+/*
+ * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
+ * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
+ * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
+ * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
+ * its ancestors has FREEZING_SELF set.
+ */
+enum freezer_state_flags {
+ CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
+ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
+ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
+ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
+
+ /* mask for all FREEZING flags */
+ CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
};
struct freezer {
- struct cgroup_subsys_state css;
- enum freezer_state state;
- spinlock_t lock; /* protects _writes_ to state */
+ struct cgroup_subsys_state css;
+ unsigned int state;
+ spinlock_t lock;
};
-static inline struct freezer *cgroup_freezer(
- struct cgroup *cgroup)
+static inline struct freezer *cgroup_freezer(struct cgroup *cgroup)
{
- return container_of(
- cgroup_subsys_state(cgroup, freezer_subsys_id),
- struct freezer, css);
+ return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id),
+ struct freezer, css);
}
static inline struct freezer *task_freezer(struct task_struct *task)
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task)
struct freezer, css);
}
+static struct freezer *parent_freezer(struct freezer *freezer)
+{
+ struct cgroup *pcg = freezer->css.cgroup->parent;
+
+ if (pcg)
+ return cgroup_freezer(pcg);
+ return NULL;
+}
+
bool cgroup_freezing(struct task_struct *task)
{
- enum freezer_state state;
bool ret;
rcu_read_lock();
- state = task_freezer(task)->state;
- ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
rcu_read_unlock();
return ret;
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task)
* cgroups_write_string() limits the size of freezer state strings to
* CGROUP_LOCAL_BUFFER_SIZE
*/
-static const char *freezer_state_strs[] = {
- "THAWED",
- "FREEZING",
- "FROZEN",
+static const char *freezer_state_strs(unsigned int state)
+{
+ if (state & CGROUP_FROZEN)
+ return "FROZEN";
+ if (state & CGROUP_FREEZING)
+ return "FREEZING";
+ return "THAWED";
};
-/*
- * State diagram
- * Transitions are caused by userspace writes to the freezer.state file.
- * The values in parenthesis are state labels. The rest are edge labels.
- *
- * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
- * ^ ^ | |
- * | \_______THAWED_______/ |
- * \__________________________THAWED____________/
- */
-
struct cgroup_subsys freezer_subsys;
-/* Locks taken and their ordering
- * ------------------------------
- * cgroup_mutex (AKA cgroup_lock)
- * freezer->lock
- * css_set_lock
- * task->alloc_lock (AKA task_lock)
- * task->sighand->siglock
- *
- * cgroup code forces css_set_lock to be taken before task->alloc_lock
- *
- * freezer_create(), freezer_destroy():
- * cgroup_mutex [ by cgroup core ]
- *
- * freezer_can_attach():
- * cgroup_mutex (held by caller of can_attach)
- *
- * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
- * freezer->lock
- * sighand->siglock (if the cgroup is freezing)
- *
- * freezer_read():
- * cgroup_mutex
- * freezer->lock
- * write_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock
- * read_lock css_set_lock (cgroup iterator start)
- *
- * freezer_write() (freeze):
- * cgroup_mutex
- * freezer->lock
- * write_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock
- * read_lock css_set_lock (cgroup iterator start)
- * sighand->siglock (fake signal delivery inside freeze_task())
- *
- * freezer_write() (unfreeze):
- * cgroup_mutex
- * freezer->lock
- * write_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock
- * read_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
- * sighand->siglock
- */
-static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
+static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
{
struct freezer *freezer;
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
return ERR_PTR(-ENOMEM);
spin_lock_init(&freezer->lock);
- freezer->state = CGROUP_THAWED;
return &freezer->css;
}
-static void freezer_destroy(struct cgroup *cgroup)
+/**
+ * freezer_css_online - commit creation of a freezer cgroup
+ * @cgroup: cgroup being created
+ *
+ * We're committing to creation of @cgroup. Mark it online and inherit
+ * parent's freezing state while holding both parent's and our
+ * freezer->lock.
+ */
+static int freezer_css_online(struct cgroup *cgroup)
+{
+ struct freezer *freezer = cgroup_freezer(cgroup);
+ struct freezer *parent = parent_freezer(freezer);
+
+ /*
+ * The following double locking and freezing state inheritance
+ * guarantee that @cgroup can never escape ancestors' freezing
+ * states. See cgroup_for_each_descendant_pre() for details.
+ */
+ if (parent)
+ spin_lock_irq(&parent->lock);
+ spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
+
+ freezer->state |= CGROUP_FREEZER_ONLINE;
+
+ if (parent && (parent->state & CGROUP_FREEZING)) {
+ freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
+ atomic_inc(&system_freezing_cnt);
+ }
+
+ spin_unlock(&freezer->lock);
+ if (parent)
+ spin_unlock_irq(&parent->lock);
+
+ return 0;
+}
+
+/**
+ * freezer_css_offline - initiate destruction of @cgroup
+ * @cgroup: cgroup being destroyed
+ *
+ * @cgroup is going away. Mark it dead and decrement system_freezing_count
+ * if it was holding one.
+ */
+static void freezer_css_offline(struct cgroup *cgroup)
{
struct freezer *freezer = cgroup_freezer(cgroup);
- if (freezer->state != CGROUP_THAWED)
+ spin_lock_irq(&freezer->lock);
+
+ if (freezer->state & CGROUP_FREEZING)
atomic_dec(&system_freezing_cnt);
- kfree(freezer);
+
+ freezer->state = 0;
+
+ spin_unlock_irq(&freezer->lock);
}
-/* task is frozen or will freeze immediately when next it gets woken */
-static bool is_task_frozen_enough(struct task_struct *task)
+static void freezer_css_free(struct cgroup *cgroup)
{
- return frozen(task) ||
- (task_is_stopped_or_traced(task) && freezing(task));
+ kfree(cgroup_freezer(cgroup));
}
/*
- * The call to cgroup_lock() in the freezer.state write method prevents
- * a write to that file racing against an attach, and hence the
- * can_attach() result will remain valid until the attach completes.
+ * Tasks can be migrated into a different freezer anytime regardless of its
+ * current state. freezer_attach() is responsible for making new tasks
+ * conform to the current state.
+ *
+ * Freezer state changes and task migration are synchronized via
+ * @freezer->lock. freezer_attach() makes the new tasks conform to the
+ * current state and all following state changes can see the new tasks.
*/
-static int freezer_can_attach(struct cgroup *new_cgroup,
- struct cgroup_taskset *tset)
+static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset)
{
- struct freezer *freezer;
+ struct freezer *freezer = cgroup_freezer(new_cgrp);
struct task_struct *task;
+ bool clear_frozen = false;
+
+ spin_lock_irq(&freezer->lock);
/*
- * Anything frozen can't move or be moved to/from.
+ * Make the new tasks conform to the current state of @new_cgrp.
+ * For simplicity, when migrating any task to a FROZEN cgroup, we
+ * revert it to FREEZING and let update_if_frozen() determine the
+ * correct state later.
+ *
+ * Tasks in @tset are on @new_cgrp but may not conform to its
+ * current state before executing the following - !frozen tasks may
+ * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
*/
- cgroup_taskset_for_each(task, new_cgroup, tset)
- if (cgroup_freezing(task))
- return -EBUSY;
+ cgroup_taskset_for_each(task, new_cgrp, tset) {
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ __thaw_task(task);
+ } else {
+ freeze_task(task);
+ freezer->state &= ~CGROUP_FROZEN;
+ clear_frozen = true;
+ }
+ }
- freezer = cgroup_freezer(new_cgroup);
- if (freezer->state != CGROUP_THAWED)
- return -EBUSY;
+ spin_unlock_irq(&freezer->lock);
- return 0;
+ /*
+ * Propagate FROZEN clearing upwards. We may race with
+ * update_if_frozen(), but as long as both work bottom-up, either
+ * update_if_frozen() sees child's FROZEN cleared or we clear the
+ * parent's FROZEN later. No parent w/ !FROZEN children can be
+ * left FROZEN.
+ */
+ while (clear_frozen && (freezer = parent_freezer(freezer))) {
+ spin_lock_irq(&freezer->lock);
+ freezer->state &= ~CGROUP_FROZEN;
+ clear_frozen = freezer->state & CGROUP_FREEZING;
+ spin_unlock_irq(&freezer->lock);
+ }
}
static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
- /*
- * No lock is needed, since the task isn't on tasklist yet,
- * so it can't be moved to another cgroup, which means the
- * freezer won't be removed and will be valid during this
- * function call. Nevertheless, apply RCU read-side critical
- * section to suppress RCU lockdep false positives.
- */
rcu_read_lock();
freezer = task_freezer(task);
- rcu_read_unlock();
/*
* The root cgroup is non-freezable, so we can skip the
* following check.
*/
if (!freezer->css.cgroup->parent)
- return;
+ goto out;
spin_lock_irq(&freezer->lock);
- BUG_ON(freezer->state == CGROUP_FROZEN);
-
- /* Locking avoids race with FREEZING -> THAWED transitions. */
- if (freezer->state == CGROUP_FREEZING)
+ if (freezer->state & CGROUP_FREEZING)
freeze_task(task);
spin_unlock_irq(&freezer->lock);
+out:
+ rcu_read_unlock();
}
-/*
- * caller must hold freezer->lock
+/**
+ * update_if_frozen - update whether a cgroup finished freezing
+ * @cgroup: cgroup of interest
+ *
+ * Once FREEZING is initiated, transition to FROZEN is lazily updated by
+ * calling this function. If the current state is FREEZING but not FROZEN,
+ * this function checks whether all tasks of this cgroup and the descendant
+ * cgroups finished freezing and, if so, sets FROZEN.
+ *
+ * The caller is responsible for grabbing RCU read lock and calling
+ * update_if_frozen() on all descendants prior to invoking this function.
+ *
+ * Task states and freezer state might disagree while tasks are being
+ * migrated into or out of @cgroup, so we can't verify task states against
+ * @freezer state here. See freezer_attach() for details.
*/
-static void update_if_frozen(struct cgroup *cgroup,
- struct freezer *freezer)
+static void update_if_frozen(struct cgroup *cgroup)
{
+ struct freezer *freezer = cgroup_freezer(cgroup);
+ struct cgroup *pos;
struct cgroup_iter it;
struct task_struct *task;
- unsigned int nfrozen = 0, ntotal = 0;
- enum freezer_state old_state = freezer->state;
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- ntotal++;
- if (freezing(task) && is_task_frozen_enough(task))
- nfrozen++;
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ spin_lock_irq(&freezer->lock);
+
+ if (!(freezer->state & CGROUP_FREEZING) ||
+ (freezer->state & CGROUP_FROZEN))
+ goto out_unlock;
+
+ /* are all (live) children frozen? */
+ cgroup_for_each_child(pos, cgroup) {
+ struct freezer *child = cgroup_freezer(pos);
+
+ if ((child->state & CGROUP_FREEZER_ONLINE) &&
+ !(child->state & CGROUP_FROZEN))
+ goto out_unlock;
}
- if (old_state == CGROUP_THAWED) {
- BUG_ON(nfrozen > 0);
- } else if (old_state == CGROUP_FREEZING) {
- if (nfrozen == ntotal)
- freezer->state = CGROUP_FROZEN;
- } else { /* old_state == CGROUP_FROZEN */
- BUG_ON(nfrozen != ntotal);
+ /* are all tasks frozen? */
+ cgroup_iter_start(cgroup, &it);
+
+ while ((task = cgroup_iter_next(cgroup, &it))) {
+ if (freezing(task)) {
+ /*
+ * freezer_should_skip() indicates that the task
+ * should be skipped when determining freezing
+ * completion. Consider it frozen in addition to
+ * the usual frozen condition.
+ */
+ if (!frozen(task) && !freezer_should_skip(task))
+ goto out_iter_end;
+ }
}
+ freezer->state |= CGROUP_FROZEN;
+out_iter_end:
cgroup_iter_end(cgroup, &it);
+out_unlock:
+ spin_unlock_irq(&freezer->lock);
}
static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
struct seq_file *m)
{
- struct freezer *freezer;
- enum freezer_state state;
+ struct cgroup *pos;
- if (!cgroup_lock_live_group(cgroup))
- return -ENODEV;
+ rcu_read_lock();
- freezer = cgroup_freezer(cgroup);
- spin_lock_irq(&freezer->lock);
- state = freezer->state;
- if (state == CGROUP_FREEZING) {
- /* We change from FREEZING to FROZEN lazily if the cgroup was
- * only partially frozen when we exitted write. */
- update_if_frozen(cgroup, freezer);
- state = freezer->state;
- }
- spin_unlock_irq(&freezer->lock);
- cgroup_unlock();
+ /* update states bottom-up */
+ cgroup_for_each_descendant_post(pos, cgroup)
+ update_if_frozen(pos);
+ update_if_frozen(cgroup);
+
+ rcu_read_unlock();
- seq_puts(m, freezer_state_strs[state]);
+ seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state));
seq_putc(m, '\n');
return 0;
}
-static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void freeze_cgroup(struct freezer *freezer)
{
+ struct cgroup *cgroup = freezer->css.cgroup;
struct cgroup_iter it;
struct task_struct *task;
- unsigned int num_cant_freeze_now = 0;
cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- if (!freeze_task(task))
- continue;
- if (is_task_frozen_enough(task))
- continue;
- if (!freezing(task) && !freezer_should_skip(task))
- num_cant_freeze_now++;
- }
+ while ((task = cgroup_iter_next(cgroup, &it)))
+ freeze_task(task);
cgroup_iter_end(cgroup, &it);
-
- return num_cant_freeze_now ? -EBUSY : 0;
}
-static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void unfreeze_cgroup(struct freezer *freezer)
{
+ struct cgroup *cgroup = freezer->css.cgroup;
struct cgroup_iter it;
struct task_struct *task;
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
cgroup_iter_end(cgroup, &it);
}
-static int freezer_change_state(struct cgroup *cgroup,
- enum freezer_state goal_state)
+/**
+ * freezer_apply_state - apply state change to a single cgroup_freezer
+ * @freezer: freezer to apply state change to
+ * @freeze: whether to freeze or unfreeze
+ * @state: CGROUP_FREEZING_* flag to set or clear
+ *
+ * Set or clear @state on @cgroup according to @freeze, and perform
+ * freezing or thawing as necessary.
+ */
+static void freezer_apply_state(struct freezer *freezer, bool freeze,
+ unsigned int state)
{
- struct freezer *freezer;
- int retval = 0;
-
- freezer = cgroup_freezer(cgroup);
+ /* also synchronizes against task migration, see freezer_attach() */
+ lockdep_assert_held(&freezer->lock);
- spin_lock_irq(&freezer->lock);
+ if (!(freezer->state & CGROUP_FREEZER_ONLINE))
+ return;
- update_if_frozen(cgroup, freezer);
-
- switch (goal_state) {
- case CGROUP_THAWED:
- if (freezer->state != CGROUP_THAWED)
- atomic_dec(&system_freezing_cnt);
- freezer->state = CGROUP_THAWED;
- unfreeze_cgroup(cgroup, freezer);
- break;
- case CGROUP_FROZEN:
- if (freezer->state == CGROUP_THAWED)
+ if (freeze) {
+ if (!(freezer->state & CGROUP_FREEZING))
atomic_inc(&system_freezing_cnt);
- freezer->state = CGROUP_FREEZING;
- retval = try_to_freeze_cgroup(cgroup, freezer);
- break;
- default:
- BUG();
+ freezer->state |= state;
+ freeze_cgroup(freezer);
+ } else {
+ bool was_freezing = freezer->state & CGROUP_FREEZING;
+
+ freezer->state &= ~state;
+
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ if (was_freezing)
+ atomic_dec(&system_freezing_cnt);
+ freezer->state &= ~CGROUP_FROZEN;
+ unfreeze_cgroup(freezer);
+ }
}
+}
+/**
+ * freezer_change_state - change the freezing state of a cgroup_freezer
+ * @freezer: freezer of interest
+ * @freeze: whether to freeze or thaw
+ *
+ * Freeze or thaw @freezer according to @freeze. The operations are
+ * recursive - all descendants of @freezer will be affected.
+ */
+static void freezer_change_state(struct freezer *freezer, bool freeze)
+{
+ struct cgroup *pos;
+
+ /* update @freezer */
+ spin_lock_irq(&freezer->lock);
+ freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
spin_unlock_irq(&freezer->lock);
- return retval;
+ /*
+ * Update all its descendants in pre-order traversal. Each
+ * descendant will try to inherit its parent's FREEZING state as
+ * CGROUP_FREEZING_PARENT.
+ */
+ rcu_read_lock();
+ cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
+ struct freezer *pos_f = cgroup_freezer(pos);
+ struct freezer *parent = parent_freezer(pos_f);
+
+ /*
+ * Our update to @parent->state is already visible which is
+ * all we need. No need to lock @parent. For more info on
+ * synchronization, see freezer_post_create().
+ */
+ spin_lock_irq(&pos_f->lock);
+ freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
+ CGROUP_FREEZING_PARENT);
+ spin_unlock_irq(&pos_f->lock);
+ }
+ rcu_read_unlock();
}
-static int freezer_write(struct cgroup *cgroup,
- struct cftype *cft,
+static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
const char *buffer)
{
- int retval;
- enum freezer_state goal_state;
+ bool freeze;
- if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
- goal_state = CGROUP_THAWED;
- else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
- goal_state = CGROUP_FROZEN;
+ if (strcmp(buffer, freezer_state_strs(0)) == 0)
+ freeze = false;
+ else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ freeze = true;
else
return -EINVAL;
- if (!cgroup_lock_live_group(cgroup))
- return -ENODEV;
- retval = freezer_change_state(cgroup, goal_state);
- cgroup_unlock();
- return retval;
+ freezer_change_state(cgroup_freezer(cgroup), freeze);
+ return 0;
+}
+
+static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft)
+{
+ struct freezer *freezer = cgroup_freezer(cgroup);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_SELF);
+}
+
+static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft)
+{
+ struct freezer *freezer = cgroup_freezer(cgroup);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
}
static struct cftype files[] = {
@@ -362,23 +462,27 @@ static struct cftype files[] = {
.read_seq_string = freezer_read,
.write_string = freezer_write,
},
+ {
+ .name = "self_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_self_freezing_read,
+ },
+ {
+ .name = "parent_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_parent_freezing_read,
+ },
{ } /* terminate */
};
struct cgroup_subsys freezer_subsys = {
.name = "freezer",
- .create = freezer_create,
- .destroy = freezer_destroy,
+ .css_alloc = freezer_css_alloc,
+ .css_online = freezer_css_online,
+ .css_offline = freezer_css_offline,
+ .css_free = freezer_css_free,
.subsys_id = freezer_subsys_id,
- .can_attach = freezer_can_attach,
+ .attach = freezer_attach,
.fork = freezer_fork,
.base_cftypes = files,
-
- /*
- * freezer subsys doesn't handle hierarchy at all. Frozen state
- * should be inherited through the hierarchy - if a parent is
- * frozen, all its children should be frozen. Fix it and remove
- * the following.
- */
- .broken_hierarchy = true,
};
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f33c715..b017887 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1784,56 +1784,20 @@ static struct cftype files[] = {
};
/*
- * post_clone() is called during cgroup_create() when the
- * clone_children mount argument was specified. The cgroup
- * can not yet have any tasks.
- *
- * Currently we refuse to set up the cgroup - thereby
- * refusing the task to be entered, and as a result refusing
- * the sys_unshare() or clone() which initiated it - if any
- * sibling cpusets have exclusive cpus or mem.
- *
- * If this becomes a problem for some users who wish to
- * allow that scenario, then cpuset_post_clone() could be
- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
- * held.
- */
-static void cpuset_post_clone(struct cgroup *cgroup)
-{
- struct cgroup *parent, *child;
- struct cpuset *cs, *parent_cs;
-
- parent = cgroup->parent;
- list_for_each_entry(child, &parent->children, sibling) {
- cs = cgroup_cs(child);
- if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
- return;
- }
- cs = cgroup_cs(cgroup);
- parent_cs = cgroup_cs(parent);
-
- mutex_lock(&callback_mutex);
- cs->mems_allowed = parent_cs->mems_allowed;
- cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
- mutex_unlock(&callback_mutex);
- return;
-}
-
-/*
- * cpuset_create - create a cpuset
+ * cpuset_css_alloc - allocate a cpuset css
* cont: control group that the new cpuset will be part of
*/
-static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
+static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
{
- struct cpuset *cs;
- struct cpuset *parent;
+ struct cgroup *parent_cg = cont->parent;
+ struct cgroup *tmp_cg;
+ struct cpuset *parent, *cs;
- if (!cont->parent) {
+ if (!parent_cg)
return &top_cpuset.css;
- }
- parent = cgroup_cs(cont->parent);
+ parent = cgroup_cs(parent_cg);
+
cs = kmalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
@@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
cs->parent = parent;
number_of_cpusets++;
- return &cs->css ;
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
+ goto skip_clone;
+
+ /*
+ * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
+ * set. This flag handling is implemented in cgroup core for
+ * histrical reasons - the flag may be specified during mount.
+ *
+ * Currently, if any sibling cpusets have exclusive cpus or mem, we
+ * refuse to clone the configuration - thereby refusing the task to
+ * be entered, and as a result refusing the sys_unshare() or
+ * clone() which initiated it. If this becomes a problem for some
+ * users who wish to allow that scenario, then this could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup.
+ */
+ list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
+ struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
+
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
+ goto skip_clone;
+ }
+
+ mutex_lock(&callback_mutex);
+ cs->mems_allowed = parent->mems_allowed;
+ cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ mutex_unlock(&callback_mutex);
+skip_clone:
+ return &cs->css;
}
/*
@@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
* will call async_rebuild_sched_domains().
*/
-static void cpuset_destroy(struct cgroup *cont)
+static void cpuset_css_free(struct cgroup *cont)
{
struct cpuset *cs = cgroup_cs(cont);
@@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont)
struct cgroup_subsys cpuset_subsys = {
.name = "cpuset",
- .create = cpuset_create,
- .destroy = cpuset_destroy,
+ .css_alloc = cpuset_css_alloc,
+ .css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
.attach = cpuset_attach,
- .post_clone = cpuset_post_clone,
.subsys_id = cpuset_subsys_id,
.base_cftypes = files,
.early_init = 1,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dbccf83..f9ff549 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7434,7 +7434,7 @@ unlock:
device_initcall(perf_event_sysfs_init);
#ifdef CONFIG_CGROUP_PERF
-static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
+static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
{
struct perf_cgroup *jc;
@@ -7451,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
return &jc->css;
}
-static void perf_cgroup_destroy(struct cgroup *cont)
+static void perf_cgroup_css_free(struct cgroup *cont)
{
struct perf_cgroup *jc;
jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7492,8 +7492,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
struct cgroup_subsys perf_subsys = {
.name = "perf_event",
.subsys_id = perf_subsys_id,
- .create = perf_cgroup_create,
- .destroy = perf_cgroup_destroy,
+ .css_alloc = perf_cgroup_css_alloc,
+ .css_free = perf_cgroup_css_free,
.exit = perf_cgroup_exit,
.attach = perf_cgroup_attach,
diff --git a/kernel/fork.c b/kernel/fork.c
index 850dde1..79de9f9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1137,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
- int cgroup_callbacks_done = 0;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1395,12 +1394,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
- /* Now that the task is set up, run cgroup callbacks if
- * necessary. We need to run them before the task is visible
- * on the tasklist. */
- cgroup_fork_callbacks(p);
- cgroup_callbacks_done = 1;
-
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
@@ -1505,7 +1498,7 @@ bad_fork_cleanup_cgroup:
#endif
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
- cgroup_exit(p, cgroup_callbacks_done);
+ cgroup_exit(p, 0);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 11f82a4..c38893b 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p)
return false;
}
- if (!(p->flags & PF_KTHREAD)) {
+ if (!(p->flags & PF_KTHREAD))
fake_signal_wake_up(p);
- /*
- * fake_signal_wake_up() goes through p's scheduler
- * lock and guarantees that TASK_STOPPED/TRACED ->
- * TASK_RUNNING transition can't race with task state
- * testing in try_to_freeze_tasks().
- */
- } else {
+ else
wake_up_state(p, TASK_INTERRUPTIBLE);
- }
spin_unlock_irqrestore(&freezer_lock, flags);
return true;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 87da817..d5a258b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only)
if (p == current || !freeze_task(p))
continue;
- /*
- * Now that we've done set_freeze_flag, don't
- * perturb a task in TASK_STOPPED or TASK_TRACED.
- * It is "frozen enough". If the task does wake
- * up, it will immediately call try_to_freeze.
- *
- * Because freeze_task() goes through p's scheduler lock, it's
- * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
- * transition can't race with task state testing here.
- */
- if (!task_is_stopped_or_traced(p) &&
- !freezer_should_skip(p))
+ if (!freezer_should_skip(p))
todo++;
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5066a6..6271b89 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7484,7 +7484,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
struct task_group, css);
}
-static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
{
struct task_group *tg, *parent;
@@ -7501,7 +7501,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
return &tg->css;
}
-static void cpu_cgroup_destroy(struct cgroup *cgrp)
+static void cpu_cgroup_css_free(struct cgroup *cgrp)
{
struct task_group *tg = cgroup_tg(cgrp);
@@ -7861,8 +7861,8 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
- .create = cpu_cgroup_create,
- .destroy = cpu_cgroup_destroy,
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_free = cpu_cgroup_css_free,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
@@ -7885,7 +7885,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
struct cpuacct root_cpuacct;
/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
{
struct cpuacct *ca;
@@ -7915,7 +7915,7 @@ out:
}
/* destroy an existing cpu accounting group */
-static void cpuacct_destroy(struct cgroup *cgrp)
+static void cpuacct_css_free(struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
@@ -8086,8 +8086,8 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
- .create = cpuacct_create,
- .destroy = cpuacct_destroy,
+ .css_alloc = cpuacct_css_alloc,
+ .css_free = cpuacct_css_free,
.subsys_id = cpuacct_subsys_id,
.base_cftypes = files,
};
diff --git a/kernel/signal.c b/kernel/signal.c
index 0af8868..5ffb5626 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1908,7 +1908,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
preempt_disable();
read_unlock(&tasklist_lock);
preempt_enable_no_resched();
- schedule();
+ freezable_schedule();
} else {
/*
* By the time we got the lock, our tracer went away.
@@ -1930,13 +1930,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
}
/*
- * While in TASK_TRACED, we were considered "frozen enough".
- * Now that we woke up, it's crucial if we're supposed to be
- * frozen that we freeze now before running anything substantial.
- */
- try_to_freeze();
-
- /*
* We are back. Now reacquire the siglock before touching
* last_siginfo, so that we are sure to have synchronized with
* any signal-sending on another CPU that wants to examine it.
@@ -2092,7 +2085,7 @@ static bool do_signal_stop(int signr)
}
/* Now we don't run again until woken by SIGCONT or SIGKILL */
- schedule();
+ freezable_schedule();
return true;
} else {
/*
@@ -2200,15 +2193,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
if (unlikely(uprobe_deny_signal()))
return 0;
-relock:
/*
- * We'll jump back here after any time we were stopped in TASK_STOPPED.
- * While in TASK_STOPPED, we were considered "frozen enough".
- * Now that we woke up, it's crucial if we're supposed to be
- * frozen that we freeze now before running anything substantial.
+ * Do this once, we can't return to user-mode if freezing() == T.
+ * do_signal_stop() and ptrace_stop() do freezable_schedule() and
+ * thus do not need another check after return.
*/
try_to_freeze();
+relock:
spin_lock_irq(&sighand->siglock);
/*
* Every stopped thread goes here after wakeup. Check to see if