From 8f89140ae41ccd9c63344e6823faa862aa7435e3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:10 -0700 Subject: cgroup: minor updates around cgroup_clear_directory() * Rename it to cgroup_clear_dir() and make it take the pointer to the target cgroup instead of the the dentry. This makes the function consistent with its counterpart - cgroup_populate_dir(). * Move cgroup_clear_directory() invocation from cgroup_d_remove_dir() to cgroup_remount() so that the function doesn't have to determine the cgroup pointer back from the dentry. cgroup_d_remove_dir() now only deals with vfs, which is slightly cleaner. This patch doesn't introduce any functional differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5583d1..09bfa87 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -957,15 +957,14 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } /** - * cgroup_clear_directory - selective removal of base and subsystem files - * @dir: directory containing the files + * cgroup_clear_dir - selective removal of base and subsystem files + * @cgrp: target cgroup * @base_files: true if the base files should be removed * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_directory(struct dentry *dir, bool base_files, - unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files, + unsigned long subsys_mask) { - struct cgroup *cgrp = __d_cgrp(dir); struct cgroup_subsys *ss; for_each_root_subsys(cgrp->root, ss) { @@ -987,9 +986,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, static void cgroup_d_remove_dir(struct dentry *dentry) { struct dentry *parent; - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - - cgroup_clear_directory(dentry, true, root->subsys_mask); parent = dentry->d_parent; spin_lock(&parent->d_lock); @@ -1376,7 +1372,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) * this before rebind_subsystems, since rebind_subsystems may * change this hierarchy's subsys_list. */ - cgroup_clear_directory(cgrp->dentry, false, removed_mask); + cgroup_clear_dir(cgrp, false, removed_mask); ret = rebind_subsystems(root, added_mask, removed_mask); if (ret) { @@ -4541,9 +4537,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) raw_spin_unlock(&release_list_lock); /* - * Remove @cgrp directory. The removal puts the base ref but we - * aren't quite done with @cgrp yet, so hold onto it. + * Clear and remove @cgrp directory. The removal puts the base ref + * but we aren't quite done with @cgrp yet, so hold onto it. */ + cgroup_clear_dir(cgrp, true, cgrp->root->subsys_mask); dget(d); cgroup_d_remove_dir(d); -- cgit v0.10.2 From b1f28d3109349899e87377e89f9d8ab5bc95ec57 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:10 -0700 Subject: cgroup: fix error path of cgroup_addrm_files() cgroup_addrm_files() mishandled error return value from cgroup_add_file() and returns error iff the last file fails to create. As we're in the process of cleaning up file add/rm error handling and will reliably propagate file creation failures, there's no point in keeping adding files after a failure. Replace the broken error collection logic with immediate error return. While at it, add lockdep assertions and function comment. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 09bfa87..9b16d75 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2780,11 +2780,26 @@ out: return error; } +/** + * cgroup_addrm_files - add or remove files to a cgroup directory + * @cgrp: the target cgroup + * @subsys: the subsystem of files to be added + * @cfts: array of cftypes to be added + * @is_add: whether to add or remove + * + * Depending on @is_add, add or remove files defined by @cfts on @cgrp. + * All @cfts should belong to @subsys. For removals, this function never + * fails. If addition fails, this function doesn't remove files already + * added. The caller is responsible for cleaning up. + */ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add) { struct cftype *cft; - int err, ret = 0; + int ret; + + lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ @@ -2796,16 +2811,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, continue; if (is_add) { - err = cgroup_add_file(cgrp, subsys, cft); - if (err) + ret = cgroup_add_file(cgrp, subsys, cft); + if (ret) { pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", - cft->name, err); - ret = err; + cft->name, ret); + return ret; + } } else { cgroup_rm_file(cgrp, cft); } } - return ret; + return 0; } static void cgroup_cfts_prepare(void) -- cgit v0.10.2 From 9ccece80ae19ed42439fc0ced76858f189cd41e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:11 -0700 Subject: cgroup: fix cgroup_add_cftypes() error handling cgroup_add_cftypes() uses cgroup_cfts_commit() to actually create the files; however, both functions ignore actual file creation errors and just assume success. This can lead to, for example, blkio hierarchy with some of the cgroups with only subset of interface files populated after cfq-iosched is loaded under heavy memory pressure, which is nasty. This patch updates cgroup_cfts_commit() and cgroup_add_cftypes() to guarantee that all files are created on success and no file is created on failure. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9b16d75..36c0ccc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2836,8 +2836,8 @@ static void cgroup_cfts_prepare(void) mutex_lock(&cgroup_mutex); } -static void cgroup_cfts_commit(struct cgroup_subsys *ss, - struct cftype *cfts, bool is_add) +static int cgroup_cfts_commit(struct cgroup_subsys *ss, + struct cftype *cfts, bool is_add) __releases(&cgroup_mutex) { LIST_HEAD(pending); @@ -2846,12 +2846,13 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, struct dentry *prev = NULL; struct inode *inode; u64 update_before; + int ret = 0; /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ if (!cfts || ss->root == &cgroup_dummy_root || !atomic_inc_not_zero(&sb->s_active)) { mutex_unlock(&cgroup_mutex); - return; + return 0; } /* @@ -2867,10 +2868,13 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, inode = root->dentry->d_inode; mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - cgroup_addrm_files(root, ss, cfts, is_add); + ret = cgroup_addrm_files(root, ss, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); + if (ret) + goto out_deact; + /* add/rm files for all cgroups created before */ rcu_read_lock(); cgroup_for_each_descendant_pre(cgrp, root) { @@ -2887,15 +2891,19 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) - cgroup_addrm_files(cgrp, ss, cfts, is_add); + ret = cgroup_addrm_files(cgrp, ss, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); rcu_read_lock(); + if (ret) + break; } rcu_read_unlock(); dput(prev); +out_deact: deactivate_super(sb); + return ret; } /** @@ -2915,6 +2923,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; + int ret; set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) @@ -2923,9 +2932,10 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) cgroup_cfts_prepare(); set->cfts = cfts; list_add_tail(&set->node, &ss->cftsets); - cgroup_cfts_commit(ss, cfts, true); - - return 0; + ret = cgroup_cfts_commit(ss, cfts, true); + if (ret) + cgroup_rm_cftypes(ss, cfts); + return ret; } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); -- cgit v0.10.2 From 628f7cd47ab758cae0353d1a6decf3d1459dca24 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:11 -0700 Subject: cgroup: separate out cgroup_base_files[] handling out of cgroup_populate/clear_dir() cgroup_populate/clear_dir() currently take @base_files and adds and removes, respectively, cgroup_base_files[] to the directory. File additions and removals are being reorganized for proper error handling and more dynamic handling for the unified hierarchy, and mixing base and subsys file handling into the same functions gets a bit confusing. This patch moves base file handling out of cgroup_populate/clear_dir() into their users - cgroup_mount(), cgroup_create() and cgroup_destroy_locked(). Note that this changes the behavior of base file removal. If @base_files is %true, cgroup_clear_dir() used to delete files regardless of cftype until there's no files left. Now, only files with matching cfts are removed. As files can only be created by the base or registered cftypes, this shouldn't result in any behavior difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 36c0ccc..9835a09 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -215,6 +215,8 @@ static u64 cgroup_serial_nr_next = 1; */ static int need_forkexit_callback __read_mostly; +static struct cftype cgroup_base_files[]; + static void cgroup_offline_fn(struct work_struct *work); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, @@ -804,8 +806,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); -static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask); +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); static const struct inode_operations cgroup_dir_inode_operations; static const struct file_operations proc_cgroupstats_operations; @@ -957,13 +958,11 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } /** - * cgroup_clear_dir - selective removal of base and subsystem files + * cgroup_clear_dir - remove subsys files in a cgroup directory * @cgrp: target cgroup - * @base_files: true if the base files should be removed * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; @@ -974,10 +973,6 @@ static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files, list_for_each_entry(set, &ss->cftsets, node) cgroup_addrm_files(cgrp, NULL, set->cfts, false); } - if (base_files) { - while (!list_empty(&cgrp->files)) - cgroup_rm_file(cgrp, NULL); - } } /* @@ -1372,17 +1367,17 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) * this before rebind_subsystems, since rebind_subsystems may * change this hierarchy's subsys_list. */ - cgroup_clear_dir(cgrp, false, removed_mask); + cgroup_clear_dir(cgrp, removed_mask); ret = rebind_subsystems(root, added_mask, removed_mask); if (ret) { /* rebind_subsystems failed, re-populate the removed files */ - cgroup_populate_dir(cgrp, false, removed_mask); + cgroup_populate_dir(cgrp, removed_mask); goto out_unlock; } /* re-populate subsystem files */ - cgroup_populate_dir(cgrp, false, added_mask); + cgroup_populate_dir(cgrp, added_mask); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -1687,7 +1682,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cred = override_creds(&init_cred); - cgroup_populate_dir(root_cgrp, true, root->subsys_mask); + cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); + cgroup_populate_dir(root_cgrp, root->subsys_mask); revert_creds(cred); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); @@ -4172,23 +4168,14 @@ static struct cftype cgroup_base_files[] = { }; /** - * cgroup_populate_dir - selectively creation of files in a directory + * cgroup_populate_dir - create subsys files in a cgroup directory * @cgrp: target cgroup - * @base_files: true if the base files should be added * @subsys_mask: mask of the subsystem ids whose files should be added */ -static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) { - int err; struct cgroup_subsys *ss; - if (base_files) { - err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); - if (err < 0) - return err; - } - /* process cftsets of each subsystem */ for_each_root_subsys(cgrp->root, ss) { struct cftype_set *set; @@ -4410,7 +4397,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } - err = cgroup_populate_dir(cgrp, true, root->subsys_mask); + err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); + if (err) + goto err_destroy; + + err = cgroup_populate_dir(cgrp, root->subsys_mask); if (err) goto err_destroy; @@ -4566,7 +4557,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * Clear and remove @cgrp directory. The removal puts the base ref * but we aren't quite done with @cgrp yet, so hold onto it. */ - cgroup_clear_dir(cgrp, true, cgrp->root->subsys_mask); + cgroup_clear_dir(cgrp, cgrp->root->subsys_mask); + cgroup_addrm_files(cgrp, NULL, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); -- cgit v0.10.2 From bee550994f6b0c1179bd3ccea58dc5c2c4ccf842 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:11 -0700 Subject: cgroup: update error handling in cgroup_populate_dir() cgroup_populate_dir() didn't use to check whether the actual file creations were successful and could return success with only subset of the requested files created, which is nasty. This patch udpates cgroup_populate_dir() so that it either succeeds with all files or fails with no file. v2: The original patch also converted for_each_root_subsys() usages to for_each_subsys() without explaining why. That part has been moved to a separate patch. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9835a09..6b73244 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4171,10 +4171,13 @@ static struct cftype cgroup_base_files[] = { * cgroup_populate_dir - create subsys files in a cgroup directory * @cgrp: target cgroup * @subsys_mask: mask of the subsystem ids whose files should be added + * + * On failure, no file is added. */ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; + int ret = 0; /* process cftsets of each subsystem */ for_each_root_subsys(cgrp->root, ss) { @@ -4182,8 +4185,11 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) if (!test_bit(ss->subsys_id, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, ss, set->cfts, true); + list_for_each_entry(set, &ss->cftsets, node) { + ret = cgroup_addrm_files(cgrp, ss, set->cfts, true); + if (ret < 0) + goto err; + } } /* This cgroup is ready now */ @@ -4201,6 +4207,9 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) } return 0; +err: + cgroup_clear_dir(cgrp, subsys_mask); + return ret; } static void css_dput_fn(struct work_struct *work) -- cgit v0.10.2 From b420ba7db15659253d4f286a0ba479d336371999 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 12 Jul 2013 12:34:02 -0700 Subject: cgroup: use for_each_subsys() instead of for_each_root_subsys() in cgroup_populate/clear_dir() rebind_subsystems() will be updated to handle file creations and removals with proper error handling and to do that will need to perform file operations before actually adding the subsystem to the hierarchy. To enable such usage, update cgroup_populate/clear_dir() to use for_each_subsys() instead of for_each_root_subsys() so that they operate on all subsystems specified by @subsys_mask whether that subsystem is currently bound to the hierarchy or not. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6b73244..8f70dc0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -965,10 +965,12 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; + int i; - for_each_root_subsys(cgrp->root, ss) { + for_each_subsys(ss, i) { struct cftype_set *set; - if (!test_bit(ss->subsys_id, &subsys_mask)) + + if (!test_bit(i, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) cgroup_addrm_files(cgrp, NULL, set->cfts, false); @@ -4177,12 +4179,13 @@ static struct cftype cgroup_base_files[] = { static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; - int ret = 0; + int i, ret = 0; /* process cftsets of each subsystem */ - for_each_root_subsys(cgrp->root, ss) { + for_each_subsys(ss, i) { struct cftype_set *set; - if (!test_bit(ss->subsys_id, &subsys_mask)) + + if (!test_bit(i, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) { -- cgit v0.10.2 From 3126121fb30941552b1a806c7c2e686bde57e270 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 17:07:30 -0700 Subject: cgroup: make rebind_subsystems() handle file additions and removals with proper error handling Currently, creating and removing cgroup files in the root directory are handled separately from the actual subsystem binding and unbinding which happens in rebind_subsystems(). Also, rebind_subsystems() users aren't handling file creation errors properly. Let's integrate top_cgroup file handling into rebind_subsystems() so that it's simpler to use and everyone handles file creation errors correctly. * On a successful return, rebind_subsystems() is guaranteed to have created all files of the new subsystems and deleted the ones belonging to the removed subsystems. After a failure, no file is created or removed. * cgroup_remount() no longer needs to make explicit populate/clear calls as it's all handled by rebind_subsystems(), and it gets proper error handling automatically. * cgroup_mount() has been updated such that the root dentry and cgroup are linked before rebind_subsystems(). Also, the init_cred dancing and base file handling are moved right above rebind_subsystems() call and proper error handling for the base files is added. While at it, add a comment explaining what's going on with the cred thing. * cgroup_kill_sb() calls rebind_subsystems() to unbind all subsystems which now implies removing all subsystem files which requires the directory's i_mutex. Grab it. This means that files on the root cgroup are removed earlier - they used to be deleted from generic super_block cleanup from vfs. This doesn't lead to any functional difference and it's cleaner to do the clean up explicitly for all files. Combined with the previous changes, this makes all cgroup file creation errors handled correctly. v2: Added comment on init_cred. v3: Li spotted that cgroup_mount() wasn't freeing tmp_links after base file addition failure. Fix it by adding free_tmp_links error handling label. v4: v3 introduced build bugs which got noticed by Fengguang's awesome kbuild test robot. Fixed, and shame on me. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Fengguang Wu diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8f70dc0..4ec8d2d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1003,7 +1003,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, { struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; - int i; + int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); @@ -1028,7 +1028,16 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (root->number_of_cgroups > 1) return -EBUSY; - /* Process each subsystem */ + ret = cgroup_populate_dir(cgrp, added_mask); + if (ret) + return ret; + + /* + * Nothing can fail from this point on. Remove files for the + * removed subsystems and rebind each subsystem. + */ + cgroup_clear_dir(cgrp, removed_mask); + for_each_subsys(ss, i) { unsigned long bit = 1UL << i; @@ -1364,22 +1373,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } - /* - * Clear out the files of subsystems that should be removed, do - * this before rebind_subsystems, since rebind_subsystems may - * change this hierarchy's subsys_list. - */ - cgroup_clear_dir(cgrp, removed_mask); - ret = rebind_subsystems(root, added_mask, removed_mask); - if (ret) { - /* rebind_subsystems failed, re-populate the removed files */ - cgroup_populate_dir(cgrp, removed_mask); + if (ret) goto out_unlock; - } - - /* re-populate subsystem files */ - cgroup_populate_dir(cgrp, added_mask); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -1578,7 +1574,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int ret = 0; struct super_block *sb; struct cgroupfs_root *new_root; + struct list_head tmp_links; struct inode *inode; + const struct cred *cred; /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); @@ -1610,10 +1608,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!root); if (root == opts.new_root) { /* We used the new root structure, so this is a new hierarchy */ - struct list_head tmp_links; struct cgroup *root_cgrp = &root->top_cgroup; struct cgroupfs_root *existing_root; - const struct cred *cred; int i; struct css_set *cset; @@ -1651,26 +1647,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; + sb->s_root->d_fsdata = root_cgrp; + root_cgrp->dentry = sb->s_root; + + /* + * We're inside get_sb() and will call lookup_one_len() to + * create the root files, which doesn't work if SELinux is + * in use. The following cred dancing somehow works around + * it. See 2ce9738ba ("cgroupfs: use init_cred when + * populating new cgroupfs mount") for more details. + */ + cred = override_creds(&init_cred); + + ret = cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); + if (ret) + goto rm_base_files; + ret = rebind_subsystems(root, root->subsys_mask, 0); - if (ret == -EBUSY) { - free_cgrp_cset_links(&tmp_links); - goto unlock_drop; - } + if (ret) + goto rm_base_files; + + revert_creds(cred); + /* * There must be no failure case after here, since rebinding * takes care of subsystems' refcounts, which are explicitly * dropped in the failure exit path. */ - /* EBUSY should be the only error here */ - BUG_ON(ret); - list_add(&root->root_list, &cgroup_roots); cgroup_root_count++; - sb->s_root->d_fsdata = root_cgrp; - root->top_cgroup.dentry = sb->s_root; - /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); @@ -1683,10 +1690,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - cred = override_creds(&init_cred); - cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); - cgroup_populate_dir(root_cgrp, root->subsys_mask); - revert_creds(cred); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -1715,6 +1718,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, kfree(opts.name); return dget(sb->s_root); + rm_base_files: + free_cgrp_cset_links(&tmp_links); + cgroup_addrm_files(&root->top_cgroup, NULL, cgroup_base_files, false); + revert_creds(cred); unlock_drop: cgroup_exit_root_id(root); mutex_unlock(&cgroup_root_mutex); @@ -1741,6 +1748,7 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(root->number_of_cgroups != 1); BUG_ON(!list_empty(&cgrp->children)); + mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -1773,6 +1781,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgrp->dentry->d_inode->i_mutex); simple_xattrs_free(&cgrp->xattrs); -- cgit v0.10.2 From f172e67cf9d842bc646d0f66792e38435a334b1e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 17:07:30 -0700 Subject: cgroup: move number_of_cgroups test out of rebind_subsystems() into cgroup_remount() rebind_subsystems() currently fails if the hierarchy has any !root cgroups; however, on the planned unified hierarchy, rebind_subsystems() will be used while populated. Move the test to cgroup_remount(), which is the only place the test is necessary anyway. As it's impossible for the other two callers of rebind_subsystems() to have populated hierarchy, this doesn't make any behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4ec8d2d..c108d3d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1021,13 +1021,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, } } - /* Currently we don't handle adding/removing subsystems when - * any child cgroups exist. This is theoretically supportable - * but involves complex error handling, so it's being left until - * later */ - if (root->number_of_cgroups > 1) - return -EBUSY; - ret = cgroup_populate_dir(cgrp, added_mask); if (ret) return ret; @@ -1373,6 +1366,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } + /* remounting is not allowed for populated hierarchies */ + if (root->number_of_cgroups > 1) { + ret = -EBUSY; + goto out_unlock; + } + ret = rebind_subsystems(root, added_mask, removed_mask); if (ret) goto out_unlock; -- cgit v0.10.2 From 1d5be6b287c8efc879fbe578e2b7bc8f7a38f313 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 12 Jul 2013 13:38:17 -0700 Subject: cgroup: move module ref handling into rebind_subsystems() Module ref handling in cgroup is rather weird. parse_cgroupfs_options() grabs all the modules for the specified subsystems. A module ref is kept if the specified subsystem is newly bound to the hierarchy. If not, or the operation fails, the refs are dropped. This scatters module ref handling across multiple functions making it difficult to track. It also make the function nasty to use for dynamic subsystem binding which is necessary for the planned unified hierarchy. There's nothing which requires the subsystem modules to be pinned between parse_cgroupfs_options() and rebind_subsystems() in both mount and remount paths. parse_cgroupfs_options() can just parse and rebind_subsystems() can handle pinning the subsystems that it wants to bind, which is a natural part of its task - binding - anyway. Move module ref handling into rebind_subsystems() which makes the code a lot simpler - modules are gotten iff it's gonna be bound and put iff unbound or binding fails. v2: Li pointed out that if a controller module is unloaded between parsing and binding, rebind_subsystems() won't notice the missing controller as it only iterates through existing controllers. Fix it by updating rebind_subsystems() to compare @added_mask to @pinned and fail with -ENOENT if they don't match. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c108d3d..2a8cf1a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1003,6 +1003,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, { struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; + unsigned long pinned = 0; int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); @@ -1010,20 +1011,32 @@ static int rebind_subsystems(struct cgroupfs_root *root, /* Check that any added subsystems are currently free */ for_each_subsys(ss, i) { - unsigned long bit = 1UL << i; - - if (!(bit & added_mask)) + if (!(added_mask & (1 << i))) continue; + /* is the subsystem mounted elsewhere? */ if (ss->root != &cgroup_dummy_root) { - /* Subsystem isn't free */ - return -EBUSY; + ret = -EBUSY; + goto out_put; + } + + /* pin the module */ + if (!try_module_get(ss->module)) { + ret = -ENOENT; + goto out_put; } + pinned |= 1 << i; + } + + /* subsys could be missing if unloaded between parsing and here */ + if (added_mask != pinned) { + ret = -ENOENT; + goto out_put; } ret = cgroup_populate_dir(cgrp, added_mask); if (ret) - return ret; + goto out_put; /* * Nothing can fail from this point on. Remove files for the @@ -1067,11 +1080,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, } else if (bit & root->subsys_mask) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); - /* - * a refcount was taken, but we already had one, so - * drop the extra reference. - */ - module_put(ss->module); #ifdef CONFIG_MODULE_UNLOAD BUG_ON(ss->module && !module_refcount(ss->module)); #endif @@ -1088,6 +1096,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, root->flags |= CGRP_ROOT_SUBSYS_BOUND; return 0; + +out_put: + for_each_subsys(ss, i) + if (pinned & (1 << i)) + module_put(ss->module); + return ret; } static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) @@ -1138,7 +1152,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) char *token, *o = data; bool all_ss = false, one_ss = false; unsigned long mask = (unsigned long)-1; - bool module_pin_failed = false; struct cgroup_subsys *ss; int i; @@ -1281,52 +1294,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (!opts->subsys_mask && !opts->name) return -EINVAL; - /* - * Grab references on all the modules we'll need, so the subsystems - * don't dance around before rebind_subsystems attaches them. This may - * take duplicate reference counts on a subsystem that's already used, - * but rebind_subsystems handles this case. - */ - for_each_subsys(ss, i) { - if (!(opts->subsys_mask & (1UL << i))) - continue; - if (!try_module_get(cgroup_subsys[i]->module)) { - module_pin_failed = true; - break; - } - } - if (module_pin_failed) { - /* - * oops, one of the modules was going away. this means that we - * raced with a module_delete call, and to the user this is - * essentially a "subsystem doesn't exist" case. - */ - for (i--; i >= 0; i--) { - /* drop refcounts only on the ones we took */ - unsigned long bit = 1UL << i; - - if (!(bit & opts->subsys_mask)) - continue; - module_put(cgroup_subsys[i]->module); - } - return -ENOENT; - } - return 0; } -static void drop_parsed_module_refcounts(unsigned long subsys_mask) -{ - struct cgroup_subsys *ss; - int i; - - mutex_lock(&cgroup_mutex); - for_each_subsys(ss, i) - if (subsys_mask & (1UL << i)) - module_put(cgroup_subsys[i]->module); - mutex_unlock(&cgroup_mutex); -} - static int cgroup_remount(struct super_block *sb, int *flags, char *data) { int ret = 0; @@ -1384,8 +1354,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - if (ret) - drop_parsed_module_refcounts(opts.subsys_mask); return ret; } @@ -1591,7 +1559,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, new_root = cgroup_root_from_opts(&opts); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - goto drop_modules; + goto out_err; } opts.new_root = new_root; @@ -1600,7 +1568,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_free_root(opts.new_root); - goto drop_modules; + goto out_err; } root = sb->s_fs_info; @@ -1708,9 +1676,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } - - /* no subsys rebinding, so refcounts don't change */ - drop_parsed_module_refcounts(opts.subsys_mask); } kfree(opts.release_agent); @@ -1728,8 +1693,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_unlock(&inode->i_mutex); drop_new_super: deactivate_locked_super(sb); - drop_modules: - drop_parsed_module_refcounts(opts.subsys_mask); out_err: kfree(opts.release_agent); kfree(opts.name); @@ -4837,7 +4800,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) /* * we shouldn't be called if the subsystem is in use, and the use of - * try_module_get in parse_cgroupfs_options should ensure that it + * try_module_get() in rebind_subsystems() should ensure that it * doesn't start being used while we're killing it off. */ BUG_ON(ss->root != &cgroup_dummy_root); -- cgit v0.10.2 From a698b4488ab98deef6c3beeba3e27fea17650132 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 21:08:27 -0700 Subject: cgroup: remove gratuituous BUG_ON()s from rebind_subsystems() rebind_subsystems() performs santiy checks even on subsystems which aren't specified to be added or removed and the checks aren't all that useful given that these are in a very cold path while the violations they check would trip up in much hotter paths. Let's remove these from rebind_subsystems(). Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a8cf1a..345fac8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1077,15 +1077,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, /* subsystem is now free - drop reference on module */ module_put(ss->module); root->subsys_mask &= ~bit; - } else if (bit & root->subsys_mask) { - /* Subsystem state should already exist */ - BUG_ON(!cgrp->subsys[i]); -#ifdef CONFIG_MODULE_UNLOAD - BUG_ON(ss->module && !module_refcount(ss->module)); -#endif - } else { - /* Subsystem state shouldn't exist */ - BUG_ON(cgrp->subsys[i]); } } -- cgit v0.10.2 From 9ad9d25a1ec82d6e52d687348e8cdd4942e7d393 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Sat, 27 Jul 2013 11:56:49 +0800 Subject: cpuset: get rid of the useless forward declaration of cpuset get rid of the useless forward declaration of the struct cpuset cause the below define it. Signed-off-by: Zhao Hongjiang Acked-by: Li Zefan Signed-off-by: Tejun Heo diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e565778..2ddd9b9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -70,7 +70,6 @@ int number_of_cpusets __read_mostly; /* Forward declare cgroup structures */ struct cgroup_subsys cpuset_subsys; -struct cpuset; /* See "Frequency meter" comments, below. */ -- cgit v0.10.2 From 0b9e6965add0701e5cbf56d5bab6d9181e948359 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Sat, 27 Jul 2013 11:56:53 +0800 Subject: cpuset: relocate a misplaced comment Comment for cpuset_css_offline() was on top of cpuset_css_free(). Move it. Signed-off-by: Zhao Hongjiang Acked-by: Li Zefan Signed-off-by: Tejun Heo diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2ddd9b9..703bfd5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2020,6 +2020,12 @@ out_unlock: return 0; } +/* + * If the cpuset being removed has its flag 'sched_load_balance' + * enabled, then simulate turning sched_load_balance off, which + * will call rebuild_sched_domains_locked(). + */ + static void cpuset_css_offline(struct cgroup *cgrp) { struct cpuset *cs = cgroup_cs(cgrp); @@ -2035,12 +2041,6 @@ static void cpuset_css_offline(struct cgroup *cgrp) mutex_unlock(&cpuset_mutex); } -/* - * If the cpuset being removed has its flag 'sched_load_balance' - * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). - */ - static void cpuset_css_free(struct cgroup *cgrp) { struct cpuset *cs = cgroup_cs(cgrp); -- cgit v0.10.2 From 2a4ac63333584b2791986cf2270f5ba9a4b97606 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 16:16:40 +0800 Subject: cgroup: remove sparse tags from offline_css() This should have been removed in commit d7eeac1913ff ("cgroup: hold cgroup_mutex before calling css_offline"). While at it, update the comments. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 345fac8..41b559f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4214,7 +4214,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, INIT_WORK(&css->dput_work, css_dput_fn); } -/* invoke ->post_create() on a new CSS and mark it online if successful */ +/* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { int ret = 0; @@ -4228,9 +4228,8 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) return ret; } -/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ +/* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; -- cgit v0.10.2 From e0798ce27346edb8aa369b5b39af5a47fdf2b25c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 17:36:25 +0800 Subject: cgroup: remove struct cgroup_seqfile_state We can use struct cfent instead. v2: - remove cgroup_seqfile_release(). Signed-off-by: Li Zefan Signed-off-by: Tejun Heo diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 41b559f..ed21043 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2397,11 +2397,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, * supports string->u64 maps, but can be extended in future. */ -struct cgroup_seqfile_state { - struct cftype *cft; - struct cgroup *cgroup; -}; - static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) { struct seq_file *sf = cb->state; @@ -2410,59 +2405,45 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) static int cgroup_seqfile_show(struct seq_file *m, void *arg) { - struct cgroup_seqfile_state *state = m->private; - struct cftype *cft = state->cft; + struct cfent *cfe = m->private; + struct cftype *cft = cfe->type; + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + if (cft->read_map) { struct cgroup_map_cb cb = { .fill = cgroup_map_add, .state = m, }; - return cft->read_map(state->cgroup, cft, &cb); + return cft->read_map(cgrp, cft, &cb); } - return cft->read_seq_string(state->cgroup, cft, m); -} - -static int cgroup_seqfile_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - kfree(seq->private); - return single_release(inode, file); + return cft->read_seq_string(cgrp, cft, m); } static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, - .release = cgroup_seqfile_release, + .release = single_release, }; static int cgroup_file_open(struct inode *inode, struct file *file) { int err; + struct cfent *cfe; struct cftype *cft; err = generic_file_open(inode, file); if (err) return err; - cft = __d_cft(file->f_dentry); + cfe = __d_cfe(file->f_dentry); + cft = cfe->type; if (cft->read_map || cft->read_seq_string) { - struct cgroup_seqfile_state *state; - - state = kzalloc(sizeof(*state), GFP_USER); - if (!state) - return -ENOMEM; - - state->cft = cft; - state->cgroup = __d_cgrp(file->f_dentry->d_parent); file->f_op = &cgroup_seqfile_operations; - err = single_open(file, cgroup_seqfile_show, state); - if (err < 0) - kfree(state); - } else if (cft->open) + err = single_open(file, cgroup_seqfile_show, cfe); + } else if (cft->open) { err = cft->open(inode, file); - else - err = 0; + } return err; } -- cgit v0.10.2 From 6f4b7e632d78c2d91502211c430722cc66428492 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 16:18:36 +0800 Subject: cgroup: more naming cleanups Constantly use @cset for css_set variables and use @cgrp as cgroup variables. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 297462b..00a7e07 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -394,8 +394,8 @@ struct cgroup_map_cb { /* cftype->flags */ enum { - CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cg */ - CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cg */ + CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ + CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ }; @@ -513,7 +513,7 @@ struct cftype_set { }; struct cgroup_scanner { - struct cgroup *cg; + struct cgroup *cgrp; int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); void (*process_task)(struct task_struct *p, struct cgroup_scanner *scan); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ed21043..9577beb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -466,7 +466,7 @@ static inline void put_css_set_taskexit(struct css_set *cset) * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * - * Returns true if "cg" matches "old_cg" except for the hierarchy + * Returns true if "cset" matches "old_cset" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". */ static bool compare_css_sets(struct css_set *cset, @@ -1839,7 +1839,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); struct task_and_cgroup { struct task_struct *task; struct cgroup *cgrp; - struct css_set *cg; + struct css_set *cset; }; struct cgroup_taskset { @@ -2057,8 +2057,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, tc = flex_array_get(group, i); old_cset = task_css_set(tc->task); - tc->cg = find_css_set(old_cset, cgrp); - if (!tc->cg) { + tc->cset = find_css_set(old_cset, cgrp); + if (!tc->cset) { retval = -ENOMEM; goto out_put_css_set_refs; } @@ -2071,7 +2071,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); + cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); } /* nothing is sensitive to fork() after this point. */ @@ -2091,9 +2091,9 @@ out_put_css_set_refs: if (retval) { for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - if (!tc->cg) + if (!tc->cset) break; - put_css_set(tc->cg); + put_css_set(tc->cset); } } out_cancel_attach: @@ -2203,9 +2203,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) mutex_lock(&cgroup_mutex); for_each_active_root(root) { - struct cgroup *from_cg = task_cgroup_from_root(from, root); + struct cgroup *from_cgrp = task_cgroup_from_root(from, root); - retval = cgroup_attach_task(from_cg, tsk, false); + retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) break; } @@ -3305,8 +3305,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_iter_start(scan->cg, &it); - while ((p = cgroup_iter_next(scan->cg, &it))) { + cgroup_iter_start(scan->cgrp, &it); + while ((p = cgroup_iter_next(scan->cgrp, &it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3339,7 +3339,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_iter_end(scan->cg, &it); + cgroup_iter_end(scan->cgrp, &it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3385,7 +3385,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { struct cgroup_scanner scan; - scan.cg = from; + scan.cgrp = from; scan.test_task = NULL; /* select all tasks in cgroup */ scan.process_task = cgroup_transfer_one_task; scan.heap = NULL; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 703bfd5..1b9c315 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -845,7 +845,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk, { struct cpuset *cpus_cs; - cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg)); + cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cgrp)); set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); } @@ -866,7 +866,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { struct cgroup_scanner scan; - scan.cg = cs->css.cgroup; + scan.cgrp = cs->css.cgroup; scan.test_task = NULL; scan.process_task = cpuset_change_cpumask; scan.heap = heap; @@ -1062,7 +1062,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, static void cpuset_change_nodemask(struct task_struct *p, struct cgroup_scanner *scan) { - struct cpuset *cs = cgroup_cs(scan->cg); + struct cpuset *cs = cgroup_cs(scan->cgrp); struct mm_struct *mm; int migrate; nodemask_t *newmems = scan->data; @@ -1102,7 +1102,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) guarantee_online_mems(mems_cs, &newmems); - scan.cg = cs->css.cgroup; + scan.cgrp = cs->css.cgroup; scan.test_task = NULL; scan.process_task = cpuset_change_nodemask; scan.heap = heap; @@ -1275,7 +1275,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) static void cpuset_change_flag(struct task_struct *tsk, struct cgroup_scanner *scan) { - cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); + cpuset_update_task_spread_flag(cgroup_cs(scan->cgrp), tsk); } /* @@ -1295,7 +1295,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) { struct cgroup_scanner scan; - scan.cg = cs->css.cgroup; + scan.cgrp = cs->css.cgroup; scan.test_task = NULL; scan.process_task = cpuset_change_flag; scan.heap = heap; @@ -1971,7 +1971,7 @@ static int cpuset_css_online(struct cgroup *cgrp) struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; - struct cgroup *pos_cg; + struct cgroup *pos_cgrp; if (!parent) return 0; @@ -2003,7 +2003,7 @@ static int cpuset_css_online(struct cgroup *cgrp) * (and likewise for mems) to the new cgroup. */ rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_cg, parent) { + cpuset_for_each_child(tmp_cs, pos_cgrp, parent) { if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { rcu_read_unlock(); goto out_unlock; -- cgit v0.10.2 From 4e96ee8e981b5140a2bcc5fff0d5c0eef39a62ee Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 09:50:50 +0800 Subject: cgroup: convert cgroup_ida to cgroup_idr This enables us to lookup a cgroup by its id. v4: - add a comment for idr_remove() in cgroup_offline_fn(). v3: - on success, idr_alloc() returns the id but not 0, so fix the BUG_ON() in cgroup_init(). - pass the right value to idr_alloc() so that the id for dummy cgroup is 0. Signed-off-by: Li Zefan Reviewed-by: Michal Hocko Signed-off-by: Tejun Heo diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 00a7e07..cca570e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -161,7 +161,7 @@ struct cgroup_name { struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - int id; /* ida allocated in-hierarchy ID */ + int id; /* idr allocated in-hierarchy ID */ /* * We link our 'sibling' struct into our parent's 'children'. @@ -322,7 +322,7 @@ struct cgroupfs_root { unsigned long flags; /* IDs for cgroups in this hierarchy */ - struct ida cgroup_ida; + struct idr cgroup_idr; /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9577beb..3f65933 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -866,8 +866,6 @@ static void cgroup_free_fn(struct work_struct *work) */ dput(cgrp->parent->dentry); - ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); - /* * Drop the active superblock reference that we took when we * created the cgroup. This will free cgrp->root, if we are @@ -1379,6 +1377,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) cgrp->root = root; RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); init_cgroup_housekeeping(cgrp); + idr_init(&root->cgroup_idr); } static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) @@ -1451,7 +1450,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) */ root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; - ida_init(&root->cgroup_ida); if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) @@ -1467,7 +1465,7 @@ static void cgroup_free_root(struct cgroupfs_root *root) /* hierarhcy ID shoulid already have been released */ WARN_ON_ONCE(root->hierarchy_id); - ida_destroy(&root->cgroup_ida); + idr_destroy(&root->cgroup_idr); kfree(root); } } @@ -1582,6 +1580,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); + root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, + 0, 1, GFP_KERNEL); + if (root_cgrp->id < 0) + goto unlock_drop; + /* Check for name clashes with existing mounts */ ret = -EBUSY; if (strlen(root->name)) @@ -4253,7 +4256,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_cgrp; rcu_assign_pointer(cgrp->name, name); - cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. + */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); if (cgrp->id < 0) goto err_free_name; @@ -4351,6 +4358,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } + idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); if (err) goto err_destroy; @@ -4377,7 +4386,7 @@ err_free_all: /* Release the reference count that we took on the superblock */ deactivate_super(sb); err_free_id: - ida_simple_remove(&root->cgroup_ida, cgrp->id); + idr_remove(&root->cgroup_idr, cgrp->id); err_free_name: kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: @@ -4570,6 +4579,14 @@ static void cgroup_offline_fn(struct work_struct *work) /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); + /* + * We should remove the cgroup object from idr before its grace + * period starts, so we won't be looking up a cgroup while the + * cgroup is being freed. + */ + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + dput(d); set_bit(CGRP_RELEASABLE, &parent->flags); @@ -4895,6 +4912,10 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); + err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, + 0, 1, GFP_KERNEL); + BUG_ON(err < 0); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); -- cgit v0.10.2 From b414dc09a31d41d696093a4cce9fb2853a5ecd4e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 09:51:06 +0800 Subject: cgroup: document how cgroup IDs are assigned As cgroup id has been used in netprio cgroup and will be used in memcg, it's important to make it clear how a cgroup id is allocated. For example, in netprio cgroup, the id is used as index of anarray. Signed-off-by: Li Zefan Reviewed-by: Michal Hocko Signed-off-by: Tejun Heo diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cca570e..4dfcd0e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -161,7 +161,13 @@ struct cgroup_name { struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - int id; /* idr allocated in-hierarchy ID */ + /* + * idr allocated in-hierarchy ID. + * + * The ID of the root cgroup is always 0, and a new cgroup + * will be assigned with a smallest available ID. + */ + int id; /* * We link our 'sibling' struct into our parent's 'children'. -- cgit v0.10.2 From e14880f7bb7e0dc0933af304998371dd543ceb40 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 09:51:31 +0800 Subject: cgroup: implement cgroup_from_id() This will be used as a replacement for css_lookup(). There's a difference with cgroup id and css id. cgroup id starts with 0, while css id starts with 1. v4: - also check if cggroup_mutex is held. - make it an inline function. Signed-off-by: Li Zefan Reviewed-by: Michal Hocko Signed-off-by: Tejun Heo diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4dfcd0e..bbf4d89 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -720,6 +720,24 @@ static inline struct cgroup* task_cgroup(struct task_struct *task, return task_subsys_state(task, subsys_id)->cgroup; } +/** + * cgroup_from_id - lookup cgroup by id + * @ss: cgroup subsys to be looked into + * @id: the cgroup id + * + * Returns the cgroup if there's valid one with @id, otherwise returns NULL. + * Should be called under rcu_read_lock(). + */ +static inline struct cgroup *cgroup_from_id(struct cgroup_subsys *ss, int id) +{ +#ifdef CONFIG_PROVE_RCU + rcu_lockdep_assert(rcu_read_lock_held() || + lockdep_is_held(&cgroup_mutex), + "cgroup_from_id() needs proper protection"); +#endif + return idr_find(&ss->root->cgroup_idr, id); +} + struct cgroup *cgroup_next_sibling(struct cgroup *pos); /** -- cgit v0.10.2 From 876ede8b2b9880615be0de3ec7b8afd0a1786e76 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Aug 2013 09:51:47 +0800 Subject: cgroup: restructure the failure path in cgroup_write_event_control() It uses a single label and checks the validity of each pointer. This is err-prone, and actually we had a bug because one of the check was insufficient. Use multi lables as we do in other places. v2: - drop initializations of local variables. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f65933..9f6dab2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3934,11 +3934,11 @@ static void cgroup_event_ptable_queue_proc(struct file *file, static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { - struct cgroup_event *event = NULL; + struct cgroup_event *event; struct cgroup *cgrp_cfile; unsigned int efd, cfd; - struct file *efile = NULL; - struct file *cfile = NULL; + struct file *efile; + struct file *cfile; char *endp; int ret; @@ -3964,31 +3964,31 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, efile = eventfd_fget(efd); if (IS_ERR(efile)) { ret = PTR_ERR(efile); - goto fail; + goto out_kfree; } event->eventfd = eventfd_ctx_fileget(efile); if (IS_ERR(event->eventfd)) { ret = PTR_ERR(event->eventfd); - goto fail; + goto out_put_efile; } cfile = fget(cfd); if (!cfile) { ret = -EBADF; - goto fail; + goto out_put_eventfd; } /* the process need read permission on control file */ /* AV: shouldn't we check that it's been opened for read instead? */ ret = inode_permission(file_inode(cfile), MAY_READ); if (ret < 0) - goto fail; + goto out_put_cfile; event->cft = __file_cft(cfile); if (IS_ERR(event->cft)) { ret = PTR_ERR(event->cft); - goto fail; + goto out_put_cfile; } /* @@ -3998,18 +3998,18 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); if (cgrp_cfile != cgrp) { ret = -EINVAL; - goto fail; + goto out_put_cfile; } if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; - goto fail; + goto out_put_cfile; } ret = event->cft->register_event(cgrp, event->cft, event->eventfd, buffer); if (ret) - goto fail; + goto out_put_cfile; efile->f_op->poll(efile, &event->pt); @@ -4029,16 +4029,13 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, return 0; -fail: - if (cfile) - fput(cfile); - - if (event && event->eventfd && !IS_ERR(event->eventfd)) - eventfd_ctx_put(event->eventfd); - - if (!IS_ERR_OR_NULL(efile)) - fput(efile); - +out_put_cfile: + fput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fput(efile); +out_kfree: kfree(event); return ret; -- cgit v0.10.2 From b395890a092d8ecbe54f005179e3dec4b6bf752a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Aug 2013 09:52:15 +0800 Subject: cgroup: rename cgroup_pidlist->mutex It's a rw_semaphore not a mutex. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9f6dab2..9420662 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3436,7 +3436,7 @@ struct cgroup_pidlist { /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; /* protects the other fields */ - struct rw_semaphore mutex; + struct rw_semaphore rwsem; }; /* @@ -3509,7 +3509,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, struct pid_namespace *ns = task_active_pid_ns(current); /* - * We can't drop the pidlist_mutex before taking the l->mutex in case + * We can't drop the pidlist_mutex before taking the l->rwsem in case * the last ref-holder is trying to remove l from the list at the same * time. Holding the pidlist_mutex precludes somebody taking whichever * list we find out from under us - compare release_pid_array(). @@ -3518,7 +3518,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, list_for_each_entry(l, &cgrp->pidlists, links) { if (l->key.type == type && l->key.ns == ns) { /* make sure l doesn't vanish out from under us */ - down_write(&l->mutex); + down_write(&l->rwsem); mutex_unlock(&cgrp->pidlist_mutex); return l; } @@ -3529,8 +3529,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, mutex_unlock(&cgrp->pidlist_mutex); return l; } - init_rwsem(&l->mutex); - down_write(&l->mutex); + init_rwsem(&l->rwsem); + down_write(&l->rwsem); l->key.type = type; l->key.ns = get_pid_ns(ns); l->owner = cgrp; @@ -3591,7 +3591,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, l->list = array; l->length = length; l->use_count++; - up_write(&l->mutex); + up_write(&l->rwsem); *lp = l; return 0; } @@ -3669,7 +3669,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) int index = 0, pid = *pos; int *iter; - down_read(&l->mutex); + down_read(&l->rwsem); if (pid) { int end = l->length; @@ -3696,7 +3696,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct cgroup_pidlist *l = s->private; - up_read(&l->mutex); + up_read(&l->rwsem); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) @@ -3742,7 +3742,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) * pidlist_mutex, we have to take pidlist_mutex first. */ mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->mutex); + down_write(&l->rwsem); BUG_ON(!l->use_count); if (!--l->use_count) { /* we're the last user if refcount is 0; remove and free */ @@ -3750,12 +3750,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) mutex_unlock(&l->owner->pidlist_mutex); pidlist_free(l->list); put_pid_ns(l->key.ns); - up_write(&l->mutex); + up_write(&l->rwsem); kfree(l); return; } mutex_unlock(&l->owner->pidlist_mutex); - up_write(&l->mutex); + up_write(&l->rwsem); } static int cgroup_pidlist_release(struct inode *inode, struct file *file) -- cgit v0.10.2 From 8af01f56a03e9cbd91a55d688fce1315021efba8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cgroup: s/cgroup_subsys_state/cgroup_css/ s/task_subsys_state/task_css/ The names of the two struct cgroup_subsys_state accessors - cgroup_subsys_state() and task_subsys_state() - are somewhat awkward. The former clashes with the type name and the latter doesn't even indicate it's somehow related to cgroup. We're about to revamp large portion of cgroup API, so, let's rename them so that they're less awkward. Most per-controller usages of the accessors are localized in accessor wrappers and given the amount of scheduled changes, this isn't gonna add any noticeable headache. Rename cgroup_subsys_state() to cgroup_css() and task_subsys_state() to task_css(). This patch is pure rename. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8056c03..628e50f 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -181,14 +181,13 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx); static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { - return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + return container_of(cgroup_css(cgroup, blkio_subsys_id), struct blkcg, css); } static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return container_of(task_subsys_state(tsk, blkio_subsys_id), - struct blkcg, css); + return container_of(task_css(tsk, blkio_subsys_id), struct blkcg, css); } static inline struct blkcg *bio_blkcg(struct bio *bio) diff --git a/fs/bio.c b/fs/bio.c index 94bbc04..8e0348f 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1946,7 +1946,7 @@ int bio_associate_current(struct bio *bio) /* associate blkcg if exists */ rcu_read_lock(); - css = task_subsys_state(current, blkio_subsys_id); + css = task_css(current, blkio_subsys_id); if (css && css_tryget(css)) bio->bi_css = css; rcu_read_unlock(); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 44dd422..552c5fe 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -647,8 +647,15 @@ struct cgroup_subsys { #undef IS_SUBSYS_ENABLED #undef SUBSYS -static inline struct cgroup_subsys_state *cgroup_subsys_state( - struct cgroup *cgrp, int subsys_id) +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @subsys_id: the subsystem of interest + * + * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. + */ +static inline struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + int subsys_id) { return cgrp->subsys[subsys_id]; } @@ -678,7 +685,7 @@ extern struct mutex cgroup_mutex; #endif /** - * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds + * task_css_check - obtain css for (task, subsys) w/ extra access conds * @task: the target task * @subsys_id: the target subsystem ID * @__c: extra condition expression to be passed to rcu_dereference_check() @@ -686,7 +693,7 @@ extern struct mutex cgroup_mutex; * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The * synchronization rules are the same as task_css_set_check(). */ -#define task_subsys_state_check(task, subsys_id, __c) \ +#define task_css_check(task, subsys_id, __c) \ task_css_set_check((task), (__c))->subsys[(subsys_id)] /** @@ -701,22 +708,22 @@ static inline struct css_set *task_css_set(struct task_struct *task) } /** - * task_subsys_state - obtain css for (task, subsys) + * task_css - obtain css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * - * See task_subsys_state_check(). + * See task_css_check(). */ -static inline struct cgroup_subsys_state * -task_subsys_state(struct task_struct *task, int subsys_id) +static inline struct cgroup_subsys_state *task_css(struct task_struct *task, + int subsys_id) { - return task_subsys_state_check(task, subsys_id, false); + return task_css_check(task, subsys_id, false); } -static inline struct cgroup* task_cgroup(struct task_struct *task, - int subsys_id) +static inline struct cgroup *task_cgroup(struct task_struct *task, + int subsys_id) { - return task_subsys_state(task, subsys_id)->cgroup; + return task_css(task, subsys_id)->cgroup; } /** diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 0fee061..52adaa7 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -35,7 +35,7 @@ static inline u32 task_cls_classid(struct task_struct *p) return 0; rcu_read_lock(); - classid = container_of(task_subsys_state(p, net_cls_subsys_id), + classid = container_of(task_css(p, net_cls_subsys_id), struct cgroup_cls_state, css)->classid; rcu_read_unlock(); @@ -51,7 +51,7 @@ static inline u32 task_cls_classid(struct task_struct *p) return 0; rcu_read_lock(); - css = task_subsys_state(p, net_cls_subsys_id); + css = task_css(p, net_cls_subsys_id); if (css) classid = container_of(css, struct cgroup_cls_state, css)->classid; diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index 50ab8c2..8110fa7 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -39,7 +39,7 @@ static inline u32 task_netprioidx(struct task_struct *p) u32 idx; rcu_read_lock(); - css = task_subsys_state(p, net_prio_subsys_id); + css = task_css(p, net_prio_subsys_id); idx = css->cgroup->id; rcu_read_unlock(); return idx; @@ -53,7 +53,7 @@ static inline u32 task_netprioidx(struct task_struct *p) u32 idx = 0; rcu_read_lock(); - css = task_subsys_state(p, net_prio_subsys_id); + css = task_css(p, net_prio_subsys_id); if (css) idx = css->cgroup->id; rcu_read_unlock(); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ae4c468..0b3caa3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -81,7 +81,7 @@ */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ +EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ #else static DEFINE_MUTEX(cgroup_mutex); #endif diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 75dda1e..9d3f615 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -47,13 +47,13 @@ struct freezer { static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) { - return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), + return container_of(cgroup_css(cgroup, freezer_subsys_id), struct freezer, css); } static inline struct freezer *task_freezer(struct task_struct *task) { - return container_of(task_subsys_state(task, freezer_subsys_id), + return container_of(task_css(task, freezer_subsys_id), struct freezer, css); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1b9c315..be4512b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -117,14 +117,14 @@ struct cpuset { /* Retrieve the cpuset for a cgroup */ static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), + return container_of(cgroup_css(cgrp, cpuset_subsys_id), struct cpuset, css); } /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return container_of(task_subsys_state(task, cpuset_subsys_id), + return container_of(task_css(task, cpuset_subsys_id), struct cpuset, css); } @@ -2724,7 +2724,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) goto out_free; rcu_read_lock(); - css = task_subsys_state(tsk, cpuset_subsys_id); + css = task_css(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); rcu_read_unlock(); if (retval < 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index 1833bc5..414c61f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -340,8 +340,8 @@ struct perf_cgroup { static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { - return container_of(task_subsys_state(task, perf_subsys_id), - struct perf_cgroup, css); + return container_of(task_css(task, perf_subsys_id), + struct perf_cgroup, css); } static inline bool @@ -7798,7 +7798,7 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) static void perf_cgroup_css_free(struct cgroup *cont) { struct perf_cgroup *jc; - jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), + jc = container_of(cgroup_css(cont, perf_subsys_id), struct perf_cgroup, css); free_percpu(jc->info); kfree(jc); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9b1f2e5..323d907 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6761,7 +6761,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, lockdep_is_held(&tsk->sighand->siglock)), struct task_group, css); tg = autogroup_task_group(tsk, tg); @@ -7086,7 +7086,7 @@ int sched_rt_handler(struct ctl_table *table, int write, /* return corresponding task_group object of a cgroup */ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), + return container_of(cgroup_css(cgrp, cpu_cgroup_subsys_id), struct task_group, css); } diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dbb7e2c..4a210fa 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -36,14 +36,14 @@ struct cpuacct { /* return cpu accounting group corresponding to this container */ static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + return container_of(cgroup_css(cgrp, cpuacct_subsys_id), struct cpuacct, css); } /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + return container_of(task_css(tsk, cpuacct_subsys_id), struct cpuacct, css); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef0a7b2..471a56d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg); /* * Return the group to which this tasks belongs. * - * We cannot use task_subsys_state() and friends because the cgroup - * subsystem changes that value before the cgroup_subsys::attach() method - * is called, therefore we cannot pin it and might observe the wrong value. + * We cannot use task_css() and friends because the cgroup subsystem + * changes that value before the cgroup_subsys::attach() method is called, + * therefore we cannot pin it and might observe the wrong value. * * The same is true for autogroup's p->signal->autogroup->tg, the autogroup * core changes this before calling sched_move_task(). diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 9cea7de..50f213f 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -42,15 +42,13 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) static inline struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup) { - return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup, - hugetlb_subsys_id)); + return hugetlb_cgroup_from_css(cgroup_css(cgroup, hugetlb_subsys_id)); } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { - return hugetlb_cgroup_from_css(task_subsys_state(task, - hugetlb_subsys_id)); + return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d12ca6f..b47bd3a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1037,8 +1037,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { - return mem_cgroup_from_css( - cgroup_subsys_state(cont, mem_cgroup_subsys_id)); + return mem_cgroup_from_css(cgroup_css(cont, mem_cgroup_subsys_id)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@ -1051,7 +1050,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) if (unlikely(!p)) return NULL; - return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); + return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); } struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 736a601..7f1654d 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -76,7 +76,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work) static struct vmpressure *cg_to_vmpressure(struct cgroup *cg) { - return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id)); + return css_to_vmpressure(cgroup_css(cg, mem_cgroup_subsys_id)); } static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index e533259..ccf8523 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -31,7 +31,7 @@ static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), + return container_of(cgroup_css(cgrp, net_prio_subsys_id), struct cgroup_netprio_state, css); } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 3a294eb..5ee72a0 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -25,13 +25,13 @@ static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), + return container_of(cgroup_css(cgrp, net_cls_subsys_id), struct cgroup_cls_state, css); } static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) { - return container_of(task_subsys_state(p, net_cls_subsys_id), + return container_of(task_css(p, net_cls_subsys_id), struct cgroup_cls_state, css); } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index e8aad69..87a0a03 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -58,12 +58,12 @@ static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup) { - return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id)); + return css_to_devcgroup(cgroup_css(cgroup, devices_subsys_id)); } static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { - return css_to_devcgroup(task_subsys_state(task, devices_subsys_id)); + return css_to_devcgroup(task_css(task, devices_subsys_id)); } struct cgroup_subsys devices_subsys; -- cgit v0.10.2 From c9710d8018273b0740e0794858f1961fcea5e61a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cpuset: drop "const" qualifiers from struct cpuset instances cpuset uses "const" qualifiers on struct cpuset in some functions; however, it doesn't work well when a value derived from returned const pointer has to be passed to an accessor. It's C after all. Drop the "const" qualifiers except for the trivially leaf ones. This patch doesn't make any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cpuset.c b/kernel/cpuset.c index be4512b..f737134 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -128,7 +128,7 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } -static inline struct cpuset *parent_cs(const struct cpuset *cs) +static inline struct cpuset *parent_cs(struct cpuset *cs) { struct cgroup *pcgrp = cs->css.cgroup->parent; @@ -319,8 +319,7 @@ static struct file_system_type cpuset_fs_type = { * * Call with callback_mutex held. */ -static void guarantee_online_cpus(const struct cpuset *cs, - struct cpumask *pmask) +static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = parent_cs(cs); @@ -338,7 +337,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, * * Call with callback_mutex held. */ -static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) +static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) cs = parent_cs(cs); @@ -383,7 +382,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * alloc_trial_cpuset - allocate a trial cpuset * @cs: the cpuset that the trial cpuset duplicates */ -static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) +static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) { struct cpuset *trial; @@ -430,7 +429,7 @@ static void free_trial_cpuset(struct cpuset *trial) * Return 0 if valid, -errno if not. */ -static int validate_change(const struct cpuset *cur, const struct cpuset *trial) +static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup *cgrp; struct cpuset *c, *par; @@ -2343,7 +2342,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - const struct cpuset *cpus_cs; + struct cpuset *cpus_cs; rcu_read_lock(); cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); @@ -2416,7 +2415,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall * (an unusual configuration), then returns the root cpuset. */ -static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) +static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) { while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) cs = parent_cs(cs); @@ -2486,7 +2485,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) */ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { - const struct cpuset *cs; /* current cpuset ancestors */ + struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) -- cgit v0.10.2 From 6d37b97428d20a21ffc39ba90e97e91e2a79a986 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: netprio_cgroup: pass around @css instead of @cgroup and kill struct cgroup_netprio_state cgroup controller API will be converted to primarily use struct cgroup_subsys_state instead of struct cgroup. In preparation, make the internal functions of netprio_cgroup pass around @css instead of @cgrp. While at it, kill struct cgroup_netprio_state which only contained struct cgroup_subsys_state without serving any purpose. All functions are converted to deal with @css directly. This patch shouldn't cause any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Neil Horman Acked-by: David S. Miller diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index 8110fa7..a24f8bb 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -25,10 +25,6 @@ struct netprio_map { u32 priomap[]; }; -struct cgroup_netprio_state { - struct cgroup_subsys_state css; -}; - extern void sock_update_netprioidx(struct sock *sk); #if IS_BUILTIN(CONFIG_NETPRIO_CGROUP) diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index ccf8523..5dfac88 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -29,12 +29,6 @@ #define PRIOMAP_MIN_SZ 128 -static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) -{ - return container_of(cgroup_css(cgrp, net_prio_subsys_id), - struct cgroup_netprio_state, css); -} - /* * Extend @dev->priomap so that it's large enough to accomodate * @target_idx. @dev->priomap.priomap_len > @target_idx after successful @@ -87,68 +81,72 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx) /** * netprio_prio - return the effective netprio of a cgroup-net_device pair - * @cgrp: cgroup part of the target pair + * @css: css part of the target pair * @dev: net_device part of the target pair * * Should be called under RCU read or rtnl lock. */ -static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev) +static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); + int id = css->cgroup->id; - if (map && cgrp->id < map->priomap_len) - return map->priomap[cgrp->id]; + if (map && id < map->priomap_len) + return map->priomap[id]; return 0; } /** * netprio_set_prio - set netprio on a cgroup-net_device pair - * @cgrp: cgroup part of the target pair + * @css: css part of the target pair * @dev: net_device part of the target pair * @prio: prio to set * - * Set netprio to @prio on @cgrp-@dev pair. Should be called under rtnl + * Set netprio to @prio on @css-@dev pair. Should be called under rtnl * lock and may fail under memory pressure for non-zero @prio. */ -static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev, - u32 prio) +static int netprio_set_prio(struct cgroup_subsys_state *css, + struct net_device *dev, u32 prio) { struct netprio_map *map; + int id = css->cgroup->id; int ret; /* avoid extending priomap for zero writes */ map = rtnl_dereference(dev->priomap); - if (!prio && (!map || map->priomap_len <= cgrp->id)) + if (!prio && (!map || map->priomap_len <= id)) return 0; - ret = extend_netdev_table(dev, cgrp->id); + ret = extend_netdev_table(dev, id); if (ret) return ret; map = rtnl_dereference(dev->priomap); - map->priomap[cgrp->id] = prio; + map->priomap[id] = prio; return 0; } static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) { - struct cgroup_netprio_state *cs; + struct cgroup_subsys_state *css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); - if (!cs) + css = kzalloc(sizeof(*css), GFP_KERNEL); + if (!css) return ERR_PTR(-ENOMEM); - return &cs->css; + return css; } static int cgrp_css_online(struct cgroup *cgrp) { - struct cgroup *parent = cgrp->parent; + struct cgroup_subsys_state *css = cgroup_css(cgrp, net_prio_subsys_id); + struct cgroup_subsys_state *parent_css; struct net_device *dev; int ret = 0; - if (!parent) + if (!cgrp->parent) return 0; + parent_css = cgroup_css(cgrp->parent, net_prio_subsys_id); rtnl_lock(); /* @@ -156,9 +154,9 @@ static int cgrp_css_online(struct cgroup *cgrp) * onlining, there is no need to clear them on offline. */ for_each_netdev(&init_net, dev) { - u32 prio = netprio_prio(parent, dev); + u32 prio = netprio_prio(parent_css, dev); - ret = netprio_set_prio(cgrp, dev, prio); + ret = netprio_set_prio(css, dev, prio); if (ret) break; } @@ -168,7 +166,7 @@ static int cgrp_css_online(struct cgroup *cgrp) static void cgrp_css_free(struct cgroup *cgrp) { - kfree(cgrp_netprio_state(cgrp)); + kfree(cgroup_css(cgrp, net_prio_subsys_id)); } static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) @@ -179,11 +177,12 @@ static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) static int read_priomap(struct cgroup *cont, struct cftype *cft, struct cgroup_map_cb *cb) { + struct cgroup_subsys_state *css = cgroup_css(cont, net_prio_subsys_id); struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) - cb->fill(cb, dev->name, netprio_prio(cont, dev)); + cb->fill(cb, dev->name, netprio_prio(css, dev)); rcu_read_unlock(); return 0; } @@ -191,6 +190,7 @@ static int read_priomap(struct cgroup *cont, struct cftype *cft, static int write_priomap(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, net_prio_subsys_id); char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; @@ -205,7 +205,7 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft, rtnl_lock(); - ret = netprio_set_prio(cgrp, dev, prio); + ret = netprio_set_prio(css, dev, prio); rtnl_unlock(); dev_put(dev); -- cgit v0.10.2 From 3f79851831a135c5cebbcaa8cddb07d02870b069 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: hugetlb_cgroup: pass around @hugetlb_cgroup instead of @cgroup cgroup controller API will be converted to primarily use struct cgroup_subsys_state instead of struct cgroup. In preparation, make hugetlb_cgroup functions pass around struct hugetlb_cgroup instead of struct cgroup. This patch shouldn't cause any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Reviewed-by: Aneesh Kumar K.V Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 50f213f..d2f9fc0 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -56,17 +56,19 @@ static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) return (h_cg == root_h_cgroup); } -static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg) +static inline struct hugetlb_cgroup * +parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { - if (!cg->parent) + struct cgroup *parent = h_cg->css.cgroup->parent; + + if (!parent) return NULL; - return hugetlb_cgroup_from_cgroup(cg->parent); + return hugetlb_cgroup_from_cgroup(parent); } -static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) +static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) { int idx; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg); for (idx = 0; idx < hugetlb_max_hstate; idx++) { if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) @@ -115,15 +117,14 @@ static void hugetlb_cgroup_css_free(struct cgroup *cgroup) * page reference and test for page active here. This function * cannot fail. */ -static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup, +static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct page *page) { int csize; struct res_counter *counter; struct res_counter *fail_res; struct hugetlb_cgroup *page_hcg; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); - struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup); + struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); page_hcg = hugetlb_cgroup_from_page(page); /* @@ -155,6 +156,7 @@ out: */ static void hugetlb_cgroup_css_offline(struct cgroup *cgroup) { + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); struct hstate *h; struct page *page; int idx = 0; @@ -163,13 +165,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup *cgroup) for_each_hstate(h) { spin_lock(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) - hugetlb_cgroup_move_parent(idx, cgroup, page); + hugetlb_cgroup_move_parent(idx, h_cg, page); spin_unlock(&hugetlb_lock); idx++; } cond_resched(); - } while (hugetlb_cgroup_have_usage(cgroup)); + } while (hugetlb_cgroup_have_usage(h_cg)); } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, -- cgit v0.10.2 From 72c97e54e0f043d33b246d7460ae0a36c4b8c643 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cgroup: add subsystem pointer to cgroup_subsys_state Currently, given a cgroup_subsys_state, there's no way to find out which subsystem the css is for, which we'll need to convert the cgroup controller API to primarily use @css instead of @cgroup. This patch adds cgroup_subsys_state->ss which points to the subsystem the @css belongs to. While at it, remove the comment about accessing @css->cgroup to determine the hierarchy. cgroup core will provide API to traverse hierarchy of css'es and we don't want subsystems to directly walk cgroup hierarchies anymore. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 552c5fe..821678a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -66,13 +66,12 @@ enum cgroup_subsys_id { /* Per-subsystem/per-cgroup state maintained by the system. */ struct cgroup_subsys_state { - /* - * The cgroup that this subsystem is attached to. Useful - * for subsystems that want to know about the cgroup - * hierarchy structure - */ + /* the cgroup that this css is attached to */ struct cgroup *cgroup; + /* the cgroup subsystem that this css is attached to */ + struct cgroup_subsys *ss; + /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0b3caa3..4234428 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4186,6 +4186,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup *cgrp) { css->cgroup = cgrp; + css->ss = ss; css->flags = 0; css->id = NULL; if (cgrp == cgroup_dummy_top) -- cgit v0.10.2 From a7c6d554aa01236ac2a9f851ab0f75704f76dfa2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add/update accessors which obtain subsys specific data from css css (cgroup_subsys_state) is usually embedded in a subsys specific data structure. Subsystems either use container_of() directly to cast from css to such data structure or has an accessor function wrapping such cast. As cgroup as whole is moving towards using css as the main interface handle, add and update such accessors to ease dealing with css's. All accessors explicitly handle NULL input and return NULL in those cases. While this looks like an extra branch in the code, as all controllers specific data structures have css as the first field, the casting doesn't involve any offsetting and the compiler can trivially optimize out the branch. * blkio, freezer, cpuset, cpu, cpuacct and net_cls didn't have such accessor. Added. * memory, hugetlb and devices already had one but didn't explicitly handle NULL input. Updated. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 628e50f..8e5863e 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -179,21 +179,25 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, void blkg_conf_finish(struct blkg_conf_ctx *ctx); +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct blkcg, css) : NULL; +} + static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { - return container_of(cgroup_css(cgroup, blkio_subsys_id), - struct blkcg, css); + return css_to_blkcg(cgroup_css(cgroup, blkio_subsys_id)); } static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return container_of(task_css(tsk, blkio_subsys_id), struct blkcg, css); + return css_to_blkcg(task_css(tsk, blkio_subsys_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) { if (bio && bio->bi_css) - return container_of(bio->bi_css, struct blkcg, css); + return css_to_blkcg(bio->bi_css); return task_blkcg(current); } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 9d3f615..1db686e 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -45,16 +45,19 @@ struct freezer { spinlock_t lock; }; +static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct freezer, css) : NULL; +} + static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) { - return container_of(cgroup_css(cgroup, freezer_subsys_id), - struct freezer, css); + return css_freezer(cgroup_css(cgroup, freezer_subsys_id)); } static inline struct freezer *task_freezer(struct task_struct *task) { - return container_of(task_css(task, freezer_subsys_id), - struct freezer, css); + return css_freezer(task_css(task, freezer_subsys_id)); } static struct freezer *parent_freezer(struct freezer *freezer) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f737134..6e9cbdd 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -114,18 +114,21 @@ struct cpuset { int relax_domain_level; }; +static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cpuset, css) : NULL; +} + /* Retrieve the cpuset for a cgroup */ static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) { - return container_of(cgroup_css(cgrp, cpuset_subsys_id), - struct cpuset, css); + return css_cs(cgroup_css(cgrp, cpuset_subsys_id)); } /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return container_of(task_css(task, cpuset_subsys_id), - struct cpuset, css); + return css_cs(task_css(task, cpuset_subsys_id)); } static inline struct cpuset *parent_cs(struct cpuset *cs) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 323d907..5bccb02 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7083,11 +7083,15 @@ int sched_rt_handler(struct ctl_table *table, int write, #ifdef CONFIG_CGROUP_SCHED +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + /* return corresponding task_group object of a cgroup */ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) { - return container_of(cgroup_css(cgrp, cpu_cgroup_subsys_id), - struct task_group, css); + return css_tg(cgroup_css(cgrp, cpu_cgroup_subsys_id)); } static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 4a210fa..8ccfa10 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -33,18 +33,21 @@ struct cpuacct { struct kernel_cpustat __percpu *cpustat; }; +static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cpuacct, css) : NULL; +} + /* return cpu accounting group corresponding to this container */ static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { - return container_of(cgroup_css(cgrp, cpuacct_subsys_id), - struct cpuacct, css); + return css_ca(cgroup_css(cgrp, cpuacct_subsys_id)); } /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return container_of(task_css(tsk, cpuacct_subsys_id), - struct cpuacct, css); + return css_ca(task_css(tsk, cpuacct_subsys_id)); } static inline struct cpuacct *__parent_ca(struct cpuacct *ca) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index d2f9fc0..95585a0 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -36,7 +36,7 @@ static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) { - return container_of(s, struct hugetlb_cgroup, css); + return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; } static inline diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b47bd3a..11d659e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -486,7 +486,7 @@ static DEFINE_MUTEX(memcg_create_mutex); static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) { - return container_of(s, struct mem_cgroup, css); + return s ? container_of(s, struct mem_cgroup, css) : NULL; } /* Some nice accessors for the vmpressure. */ diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 5ee72a0..af412ab 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -23,16 +23,19 @@ #include #include +static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cgroup_cls_state, css) : NULL; +} + static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) { - return container_of(cgroup_css(cgrp, net_cls_subsys_id), - struct cgroup_cls_state, css); + return css_cls_state(cgroup_css(cgrp, net_cls_subsys_id)); } static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) { - return container_of(task_css(p, net_cls_subsys_id), - struct cgroup_cls_state, css); + return css_cls_state(task_css(p, net_cls_subsys_id)); } static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 87a0a03..9095364 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -53,7 +53,7 @@ struct dev_cgroup { static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) { - return container_of(s, struct dev_cgroup, css); + return s ? container_of(s, struct dev_cgroup, css) : NULL; } static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup) -- cgit v0.10.2 From 6387698699afd72d6304566fb6ccf84bffe07c56 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add css_parent() Currently, controllers have to explicitly follow the cgroup hierarchy to find the parent of a given css. cgroup is moving towards using cgroup_subsys_state as the main controller interface construct, so let's provide a way to climb the hierarchy using just csses. This patch implements css_parent() which, given a css, returns its parent. The function is guarnateed to valid non-NULL parent css as long as the target css is not at the top of the hierarchy. freezer, cpuset, cpu, cpuacct, hugetlb, memory, net_cls and devices are converted to use css_parent() instead of accessing cgroup->parent directly. * __parent_ca() is dropped from cpuacct and its usage is replaced with parent_ca(). The only difference between the two was NULL test on cgroup->parent which is now embedded in css_parent() making the distinction moot. Note that eventually a css->parent field will be added to css and the NULL check in css_parent() will go away. This patch shouldn't cause any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8e5863e..b6802c4 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -209,9 +209,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { - struct cgroup *pcg = blkcg->css.cgroup->parent; - - return pcg ? cgroup_to_blkcg(pcg) : NULL; + return css_to_blkcg(css_parent(&blkcg->css)); } /** diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 821678a..18112a3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -647,6 +647,21 @@ struct cgroup_subsys { #undef SUBSYS /** + * css_parent - find the parent css + * @css: the target cgroup_subsys_state + * + * Return the parent css of @css. This function is guaranteed to return + * non-NULL parent as long as @css isn't the root. + */ +static inline +struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) +{ + struct cgroup *parent_cgrp = css->cgroup->parent; + + return parent_cgrp ? parent_cgrp->subsys[css->ss->subsys_id] : NULL; +} + +/** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @subsys_id: the subsystem of interest diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 1db686e..657a73c 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -62,11 +62,7 @@ static inline struct freezer *task_freezer(struct task_struct *task) static struct freezer *parent_freezer(struct freezer *freezer) { - struct cgroup *pcg = freezer->css.cgroup->parent; - - if (pcg) - return cgroup_freezer(pcg); - return NULL; + return css_freezer(css_parent(&freezer->css)); } bool cgroup_freezing(struct task_struct *task) @@ -234,7 +230,7 @@ static void freezer_fork(struct task_struct *task) * The root cgroup is non-freezable, so we can skip the * following check. */ - if (!freezer->css.cgroup->parent) + if (!parent_freezer(freezer)) goto out; spin_lock_irq(&freezer->lock); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6e9cbdd..259a4af 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -133,11 +133,7 @@ static inline struct cpuset *task_cs(struct task_struct *task) static inline struct cpuset *parent_cs(struct cpuset *cs) { - struct cgroup *pcgrp = cs->css.cgroup->parent; - - if (pcgrp) - return cgroup_cs(pcgrp); - return NULL; + return css_cs(css_parent(&cs->css)); } #ifdef CONFIG_NUMA diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5bccb02..7a10742 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7114,13 +7114,10 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) static int cpu_cgroup_css_online(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); - struct task_group *parent; + struct task_group *parent = css_tg(css_parent(&tg->css)); - if (!cgrp->parent) - return 0; - - parent = cgroup_tg(cgrp->parent); - sched_online_group(tg, parent); + if (parent) + sched_online_group(tg, parent); return 0; } diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 8ccfa10..f6926a1 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -50,16 +50,9 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) return css_ca(task_css(tsk, cpuacct_subsys_id)); } -static inline struct cpuacct *__parent_ca(struct cpuacct *ca) -{ - return cgroup_ca(ca->css.cgroup->parent); -} - static inline struct cpuacct *parent_ca(struct cpuacct *ca) { - if (!ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); + return css_ca(css_parent(&ca->css)); } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); @@ -284,7 +277,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) while (ca != &root_cpuacct) { kcpustat = this_cpu_ptr(ca->cpustat); kcpustat->cpustat[index] += val; - ca = __parent_ca(ca); + ca = parent_ca(ca); } rcu_read_unlock(); } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 95585a0..57ecb5d 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -59,11 +59,7 @@ static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) static inline struct hugetlb_cgroup * parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { - struct cgroup *parent = h_cg->css.cgroup->parent; - - if (!parent) - return NULL; - return hugetlb_cgroup_from_cgroup(parent); + return hugetlb_cgroup_from_css(css_parent(&h_cg->css)); } static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 11d659e..69b3e52 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1524,10 +1524,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) int mem_cgroup_swappiness(struct mem_cgroup *memcg) { - struct cgroup *cgrp = memcg->css.cgroup; - /* root ? */ - if (cgrp->parent == NULL) + if (!css_parent(&memcg->css)) return vm_swappiness; return memcg->swappiness; @@ -5026,11 +5024,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, { int retval = 0; struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - struct cgroup *parent = cont->parent; - struct mem_cgroup *parent_memcg = NULL; - - if (parent) - parent_memcg = mem_cgroup_from_cont(parent); + struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); mutex_lock(&memcg_create_mutex); @@ -5282,18 +5276,15 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, unsigned long long *mem_limit, unsigned long long *memsw_limit) { - struct cgroup *cgroup; unsigned long long min_limit, min_memsw_limit, tmp; min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - cgroup = memcg->css.cgroup; if (!memcg->use_hierarchy) goto out; - while (cgroup->parent) { - cgroup = cgroup->parent; - memcg = mem_cgroup_from_cont(cgroup); + while (css_parent(&memcg->css)) { + memcg = mem_cgroup_from_css(css_parent(&memcg->css)); if (!memcg->use_hierarchy) break; tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -5523,16 +5514,11 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup *parent; - - if (val > 100) - return -EINVAL; + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - if (cgrp->parent == NULL) + if (val > 100 || !parent) return -EINVAL; - parent = mem_cgroup_from_cont(cgrp->parent); - mutex_lock(&memcg_create_mutex); /* If under hierarchy, only empty-root can set this value */ @@ -5861,14 +5847,12 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup *parent; + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!cgrp->parent || !((val == 0) || (val == 1))) + if (!parent || !((val == 0) || (val == 1))) return -EINVAL; - parent = mem_cgroup_from_cont(cgrp->parent); - mutex_lock(&memcg_create_mutex); /* oom-kill-disable is a flag for subhierarchy. */ if ((parent->use_hierarchy) || memcg_has_children(memcg)) { @@ -6266,15 +6250,14 @@ free_out: static int mem_cgroup_css_online(struct cgroup *cont) { - struct mem_cgroup *memcg, *parent; + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); int error = 0; - if (!cont->parent) + if (!parent) return 0; mutex_lock(&memcg_create_mutex); - memcg = mem_cgroup_from_cont(cont); - parent = mem_cgroup_from_cont(cont->parent); memcg->use_hierarchy = parent->use_hierarchy; memcg->oom_kill_disable = parent->oom_kill_disable; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index af412ab..9e6b75e 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -50,9 +50,11 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) static int cgrp_css_online(struct cgroup *cgrp) { - if (cgrp->parent) - cgrp_cls_state(cgrp)->classid = - cgrp_cls_state(cgrp->parent)->classid; + struct cgroup_cls_state *cs = cgrp_cls_state(cgrp); + struct cgroup_cls_state *parent = css_cls_state(css_parent(&cs->css)); + + if (parent) + cs->classid = parent->classid; return 0; } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 9095364..635a49d 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -198,13 +198,11 @@ static inline bool is_devcg_online(const struct dev_cgroup *devcg) */ static int devcgroup_online(struct cgroup *cgroup) { - struct dev_cgroup *dev_cgroup, *parent_dev_cgroup = NULL; + struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup); + struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css_parent(&dev_cgroup->css)); int ret = 0; mutex_lock(&devcgroup_mutex); - dev_cgroup = cgroup_to_devcgroup(cgroup); - if (cgroup->parent) - parent_dev_cgroup = cgroup_to_devcgroup(cgroup->parent); if (parent_dev_cgroup == NULL) dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; @@ -394,12 +392,10 @@ static bool may_access(struct dev_cgroup *dev_cgroup, static int parent_has_perm(struct dev_cgroup *childcg, struct dev_exception_item *ex) { - struct cgroup *pcg = childcg->css.cgroup->parent; - struct dev_cgroup *parent; + struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css)); - if (!pcg) + if (!parent) return 1; - parent = cgroup_to_devcgroup(pcg); return may_access(parent, ex, childcg->behavior); } @@ -524,15 +520,11 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, char temp[12]; /* 11 + 1 characters needed for a u32 */ int count, rc = 0; struct dev_exception_item ex; - struct cgroup *p = devcgroup->css.cgroup; - struct dev_cgroup *parent = NULL; + struct dev_cgroup *parent = css_to_devcgroup(css_parent(&devcgroup->css)); if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (p->parent) - parent = cgroup_to_devcgroup(p->parent); - memset(&ex, 0, sizeof(ex)); b = buffer; -- cgit v0.10.2 From eb95419b023abacb415e2a18fea899023ce7624d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup * in subsystem implementations for the following reasons. * With unified hierarchy, subsystems will be dynamically bound and unbound from cgroups and thus css's (cgroup_subsys_state) may be created and destroyed dynamically over the lifetime of a cgroup, which is different from the current state where all css's are allocated and destroyed together with the associated cgroup. This in turn means that cgroup_css() should be synchronized and may return NULL, making it more cumbersome to use. * Differing levels of per-subsystem granularity in the unified hierarchy means that the task and descendant iterators should behave differently depending on the specific subsystem the iteration is being performed for. * In majority of the cases, subsystems only care about its part in the cgroup hierarchy - ie. the hierarchy of css's. Subsystem methods often obtain the matching css pointer from the cgroup and don't bother with the cgroup pointer itself. Passing around css fits much better. This patch converts all cgroup_subsys methods to take @css instead of @cgroup. The conversions are mostly straight-forward. A few noteworthy changes are * ->css_alloc() now takes css of the parent cgroup rather than the pointer to the new cgroup as the css for the new cgroup doesn't exist yet. Knowing the parent css is enough for all the existing subsystems. * In kernel/cgroup.c::offline_css(), unnecessary open coded css dereference is replaced with local variable access. This patch shouldn't cause any behavior differences. v2: Unnecessary explicit cgrp->subsys[] deref in css_online() replaced with local variable @css as suggested by Li Zefan. Rebased on top of new for-3.12 which includes for-3.11-fixes so that ->css_free() invocation added by da0a12caff ("cgroup: fix a leak when percpu_ref_init() fails") is converted too. Suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Daniel Wagner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe Cc: Steven Rostedt diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 290792a..79fd9f4 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -765,18 +765,18 @@ struct cftype blkcg_files[] = { /** * blkcg_css_offline - cgroup css_offline callback - * @cgroup: cgroup of interest + * @css: css of interest * - * This function is called when @cgroup is about to go away and responsible - * for shooting down all blkgs associated with @cgroup. blkgs should be + * This function is called when @css is about to go away and responsible + * for shooting down all blkgs associated with @css. blkgs should be * removed while holding both q and blkcg locks. As blkcg lock is nested * inside q lock, this function performs reverse double lock dancing. * * This is the blkcg counterpart of ioc_release_fn(). */ -static void blkcg_css_offline(struct cgroup *cgroup) +static void blkcg_css_offline(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); spin_lock_irq(&blkcg->lock); @@ -798,21 +798,21 @@ static void blkcg_css_offline(struct cgroup *cgroup) spin_unlock_irq(&blkcg->lock); } -static void blkcg_css_free(struct cgroup *cgroup) +static void blkcg_css_free(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); if (blkcg != &blkcg_root) kfree(blkcg); } -static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { static atomic64_t id_seq = ATOMIC64_INIT(0); struct blkcg *blkcg; - struct cgroup *parent = cgroup->parent; - if (!parent) { + if (!parent_css) { blkcg = &blkcg_root; goto done; } @@ -883,14 +883,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *task; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css->cgroup, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 18112a3..9c2b9dd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -579,18 +579,22 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); */ struct cgroup_subsys { - struct cgroup_subsys_state *(*css_alloc)(struct cgroup *cgrp); - int (*css_online)(struct cgroup *cgrp); - void (*css_offline)(struct cgroup *cgrp); - void (*css_free)(struct cgroup *cgrp); - - int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); - void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); + struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); + int (*css_online)(struct cgroup_subsys_state *css); + void (*css_offline)(struct cgroup_subsys_state *css); + void (*css_free)(struct cgroup_subsys_state *css); + + int (*can_attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); void (*fork)(struct task_struct *task); - void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp, + void (*exit)(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, struct task_struct *task); - void (*bind)(struct cgroup *root); + void (*bind)(struct cgroup_subsys_state *root_css); int subsys_id; int disabled; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4234428..271d9a5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -853,8 +853,11 @@ static void cgroup_free_fn(struct work_struct *work) /* * Release the subsystem state objects. */ - for_each_root_subsys(cgrp->root, ss) - ss->css_free(cgrp); + for_each_root_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + ss->css_free(css); + } cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1056,7 +1059,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgrp); + ss->bind(cgrp->subsys[i]); /* refcount was already taken, and we're keeping it */ root->subsys_mask |= bit; @@ -1066,7 +1069,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_dummy_top); + ss->bind(cgroup_dummy_top->subsys[i]); cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; cgrp->subsys[i] = NULL; cgroup_subsys[i]->root = &cgroup_dummy_root; @@ -2049,8 +2052,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (ss->can_attach) { - retval = ss->can_attach(cgrp, &tset); + retval = ss->can_attach(css, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2089,8 +2094,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. */ for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (ss->attach) - ss->attach(cgrp, &tset); + ss->attach(css, &tset); } /* @@ -2109,10 +2116,12 @@ out_put_css_set_refs: out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(cgrp, &tset); + ss->cancel_attach(css, &tset); } } out_free_group_list: @@ -4206,14 +4215,15 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; int ret = 0; lockdep_assert_held(&cgroup_mutex); if (ss->css_online) - ret = ss->css_online(cgrp); + ret = ss->css_online(css); if (!ret) - cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; + css->flags |= CSS_ONLINE; return ret; } @@ -4228,9 +4238,9 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) return; if (ss->css_offline) - ss->css_offline(cgrp); + ss->css_offline(css); - cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; + css->flags &= ~CSS_ONLINE; } /* @@ -4305,7 +4315,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(cgrp); + css = ss->css_alloc(parent->subsys[ss->subsys_id]); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4313,7 +4323,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = percpu_ref_init(&css->refcnt, css_release); if (err) { - ss->css_free(cgrp); + ss->css_free(css); goto err_free_all; } @@ -4386,7 +4396,7 @@ err_free_all: if (css) { percpu_ref_cancel_init(&css->refcnt); - ss->css_free(cgrp); + ss->css_free(css); } } mutex_unlock(&cgroup_mutex); @@ -4641,7 +4651,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_dummy_top); + css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, cgroup_dummy_top); @@ -4720,7 +4730,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(cgroup_dummy_top); + css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -4836,7 +4846,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. */ - ss->css_free(cgroup_dummy_top); + ss->css_free(cgroup_dummy_top->subsys[ss->subsys_id]); cgroup_dummy_top->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -5192,10 +5202,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) */ for_each_builtin_subsys(ss, i) { if (ss->exit) { - struct cgroup *old_cgrp = cset->subsys[i]->cgroup; - struct cgroup *cgrp = task_cgroup(tsk, i); + struct cgroup_subsys_state *old_css = cset->subsys[i]; + struct cgroup_subsys_state *css = task_css(tsk, i); - ss->exit(cgrp, old_cgrp, tsk); + ss->exit(css, old_css, tsk); } } } @@ -5529,7 +5539,8 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5539,9 +5550,9 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) return css; } -static void debug_css_free(struct cgroup *cgrp) +static void debug_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp->subsys[debug_subsys_id]); + kfree(css); } static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 657a73c..f03a857 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -91,7 +91,8 @@ static const char *freezer_state_strs(unsigned int state) struct cgroup_subsys freezer_subsys; -static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +freezer_css_alloc(struct cgroup_subsys_state *parent_css) { struct freezer *freezer; @@ -104,16 +105,16 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) } /** - * freezer_css_online - commit creation of a freezer cgroup - * @cgroup: cgroup being created + * freezer_css_online - commit creation of a freezer css + * @css: css being created * - * We're committing to creation of @cgroup. Mark it online and inherit + * We're committing to creation of @css. Mark it online and inherit * parent's freezing state while holding both parent's and our * freezer->lock. */ -static int freezer_css_online(struct cgroup *cgroup) +static int freezer_css_online(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); /* @@ -140,15 +141,15 @@ static int freezer_css_online(struct cgroup *cgroup) } /** - * freezer_css_offline - initiate destruction of @cgroup - * @cgroup: cgroup being destroyed + * freezer_css_offline - initiate destruction of a freezer css + * @css: css being destroyed * - * @cgroup is going away. Mark it dead and decrement system_freezing_count - * if it was holding one. + * @css is going away. Mark it dead and decrement system_freezing_count if + * it was holding one. */ -static void freezer_css_offline(struct cgroup *cgroup) +static void freezer_css_offline(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); spin_lock_irq(&freezer->lock); @@ -160,9 +161,9 @@ static void freezer_css_offline(struct cgroup *cgroup) spin_unlock_irq(&freezer->lock); } -static void freezer_css_free(struct cgroup *cgroup) +static void freezer_css_free(struct cgroup_subsys_state *css) { - kfree(cgroup_freezer(cgroup)); + kfree(css_freezer(css)); } /* @@ -174,25 +175,26 @@ static void freezer_css_free(struct cgroup *cgroup) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_subsys_state *new_css, + struct cgroup_taskset *tset) { - struct freezer *freezer = cgroup_freezer(new_cgrp); + struct freezer *freezer = css_freezer(new_css); struct task_struct *task; bool clear_frozen = false; spin_lock_irq(&freezer->lock); /* - * Make the new tasks conform to the current state of @new_cgrp. + * Make the new tasks conform to the current state of @new_css. * For simplicity, when migrating any task to a FROZEN cgroup, we * revert it to FREEZING and let update_if_frozen() determine the * correct state later. * - * Tasks in @tset are on @new_cgrp but may not conform to its + * Tasks in @tset are on @new_css but may not conform to its * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_cgrp, tset) { + cgroup_taskset_for_each(task, new_css->cgroup, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 259a4af..8ce3fdc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1455,9 +1455,10 @@ static int fmeter_getrate(struct fmeter *fmp) } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct task_struct *task; int ret; @@ -1468,11 +1469,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * flag is set. */ ret = -ENOSPC; - if (!cgroup_sane_behavior(cgrp) && + if (!cgroup_sane_behavior(css->cgroup) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css->cgroup, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1501,11 +1502,11 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup *cgrp, +static void cpuset_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { mutex_lock(&cpuset_mutex); - cgroup_cs(cgrp)->attach_in_progress--; + css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); } @@ -1516,7 +1517,8 @@ static void cpuset_cancel_attach(struct cgroup *cgrp, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; @@ -1524,7 +1526,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *oldcs = cgroup_cs(oldcgrp); struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct cpuset *mems_cs = effective_nodemask_cpuset(cs); @@ -1539,7 +1541,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css->cgroup, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1940,11 +1942,12 @@ static struct cftype files[] = { * cgrp: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuset *cs; - if (!cgrp->parent) + if (!parent_css) return &top_cpuset.css; cs = kzalloc(sizeof(*cs), GFP_KERNEL); @@ -1964,9 +1967,9 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) return &cs->css; } -static int cpuset_css_online(struct cgroup *cgrp) +static int cpuset_css_online(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; struct cgroup *pos_cgrp; @@ -1984,7 +1987,7 @@ static int cpuset_css_online(struct cgroup *cgrp) number_of_cpusets++; - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; /* @@ -2024,9 +2027,9 @@ out_unlock: * will call rebuild_sched_domains_locked(). */ -static void cpuset_css_offline(struct cgroup *cgrp) +static void cpuset_css_offline(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -2039,9 +2042,9 @@ static void cpuset_css_offline(struct cgroup *cgrp) mutex_unlock(&cpuset_mutex); } -static void cpuset_css_free(struct cgroup *cgrp) +static void cpuset_css_free(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); free_cpumask_var(cs->cpus_allowed); kfree(cs); diff --git a/kernel/events/core.c b/kernel/events/core.c index 414c61f..9705a0e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7778,7 +7778,8 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) +static struct cgroup_subsys_state * +perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct perf_cgroup *jc; @@ -7795,11 +7796,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) return &jc->css; } -static void perf_cgroup_css_free(struct cgroup *cont) +static void perf_cgroup_css_free(struct cgroup_subsys_state *css) { - struct perf_cgroup *jc; - jc = container_of(cgroup_css(cont, perf_subsys_id), - struct perf_cgroup, css); + struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); + free_percpu(jc->info); kfree(jc); } @@ -7811,15 +7811,17 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) + cgroup_taskset_for_each(task, css->cgroup, tset) task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, +static void perf_cgroup_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, struct task_struct *task) { /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7a10742..622b7ef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7094,16 +7094,17 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) return css_tg(cgroup_css(cgrp, cpu_cgroup_subsys_id)); } -static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { - struct task_group *tg, *parent; + struct task_group *parent = css_tg(parent_css); + struct task_group *tg; - if (!cgrp->parent) { + if (!parent) { /* This is early initialization for the top cgroup */ return &root_task_group.css; } - parent = cgroup_tg(cgrp->parent); tg = sched_create_group(parent); if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); @@ -7111,38 +7112,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup *cgrp) +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); - struct task_group *parent = css_tg(css_parent(&tg->css)); + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css_parent(css)); if (parent) sched_online_group(tg, parent); return 0; } -static void cpu_cgroup_css_free(struct cgroup *cgrp) +static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); sched_destroy_group(tg); } -static void cpu_cgroup_css_offline(struct cgroup *cgrp) +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); sched_offline_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup *cgrp, +static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css->cgroup, tset) { #ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) + if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -7153,18 +7154,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp, return 0; } -static void cpu_cgroup_attach(struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) + cgroup_taskset_for_each(task, css->cgroup, tset) sched_move_task(task); } -static void -cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, - struct task_struct *task) +static void cpu_cgroup_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f6926a1..1b784d9 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -62,11 +62,12 @@ static struct cpuacct root_cpuacct = { }; /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuacct *ca; - if (!cgrp->parent) + if (!parent_css) return &root_cpuacct.css; ca = kzalloc(sizeof(*ca), GFP_KERNEL); @@ -92,9 +93,9 @@ out: } /* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) +static void cpuacct_css_free(struct cgroup_subsys_state *css) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); free_percpu(ca->cpustat); free_percpu(ca->cpuusage); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 57ecb5d..e213243 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -73,19 +73,18 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) return false; } -static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { + struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); + struct hugetlb_cgroup *h_cgroup; int idx; - struct cgroup *parent_cgroup; - struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup; h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); - parent_cgroup = cgroup->parent; - if (parent_cgroup) { - parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup); + if (parent_h_cgroup) { for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) res_counter_init(&h_cgroup->hugepage[idx], &parent_h_cgroup->hugepage[idx]); @@ -97,11 +96,11 @@ static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgrou return &h_cgroup->css; } -static void hugetlb_cgroup_css_free(struct cgroup *cgroup) +static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cgroup; - h_cgroup = hugetlb_cgroup_from_cgroup(cgroup); + h_cgroup = hugetlb_cgroup_from_css(css); kfree(h_cgroup); } @@ -150,9 +149,9 @@ out: * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. */ -static void hugetlb_cgroup_css_offline(struct cgroup *cgroup) +static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; struct page *page; int idx = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 69b3e52..32cca0f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6211,7 +6211,7 @@ static void __init mem_cgroup_soft_limit_tree_init(void) } static struct cgroup_subsys_state * __ref -mem_cgroup_css_alloc(struct cgroup *cont) +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct mem_cgroup *memcg; long error = -ENOMEM; @@ -6226,7 +6226,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) goto free_out; /* root ? */ - if (cont->parent == NULL) { + if (parent_css == NULL) { root_mem_cgroup = memcg; res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); @@ -6248,10 +6248,10 @@ free_out: } static int -mem_cgroup_css_online(struct cgroup *cont) +mem_cgroup_css_online(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); int error = 0; if (!parent) @@ -6308,9 +6308,9 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) mem_cgroup_iter_invalidate(root_mem_cgroup); } -static void mem_cgroup_css_offline(struct cgroup *cont) +static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); kmem_cgroup_css_offline(memcg); @@ -6319,9 +6319,9 @@ static void mem_cgroup_css_offline(struct cgroup *cont) mem_cgroup_destroy_all_caches(memcg); } -static void mem_cgroup_css_free(struct cgroup *cont) +static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); memcg_destroy_kmem(memcg); __mem_cgroup_free(memcg); @@ -6691,12 +6691,12 @@ static void mem_cgroup_clear_mc(void) mem_cgroup_end_move(from); } -static int mem_cgroup_can_attach(struct cgroup *cgroup, +static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); unsigned long move_charge_at_immigrate; /* @@ -6738,7 +6738,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup *cgroup, +static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { mem_cgroup_clear_mc(); @@ -6886,7 +6886,7 @@ retry: up_read(&mm->mmap_sem); } -static void mem_cgroup_move_task(struct cgroup *cont, +static void mem_cgroup_move_task(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); @@ -6901,16 +6901,16 @@ static void mem_cgroup_move_task(struct cgroup *cont, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup *cgroup, +static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup *cgroup, +static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup *cont, +static void mem_cgroup_move_task(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { } @@ -6920,15 +6920,15 @@ static void mem_cgroup_move_task(struct cgroup *cont, * Cgroup retains root cgroups across [un]mount cycles making it necessary * to verify sane_behavior flag on each mount attempt. */ -static void mem_cgroup_bind(struct cgroup *root) +static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) { /* * use_hierarchy is forced with sane_behavior. cgroup core * guarantees that @root doesn't have any children, so turning it * on for the root memcg is enough. */ - if (cgroup_sane_behavior(root)) - mem_cgroup_from_cont(root)->use_hierarchy = true; + if (cgroup_sane_behavior(root_css->cgroup)) + mem_cgroup_from_css(root_css)->use_hierarchy = true; } struct cgroup_subsys mem_cgroup_subsys = { diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 5dfac88..8d095b4 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -126,7 +126,8 @@ static int netprio_set_prio(struct cgroup_subsys_state *css, return 0; } -static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css; @@ -137,16 +138,14 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) return css; } -static int cgrp_css_online(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, net_prio_subsys_id); - struct cgroup_subsys_state *parent_css; + struct cgroup_subsys_state *parent_css = css_parent(css); struct net_device *dev; int ret = 0; - if (!cgrp->parent) + if (!parent_css) return 0; - parent_css = cgroup_css(cgrp->parent, net_prio_subsys_id); rtnl_lock(); /* @@ -164,9 +163,9 @@ static int cgrp_css_online(struct cgroup *cgrp) return ret; } -static void cgrp_css_free(struct cgroup *cgrp) +static void cgrp_css_free(struct cgroup_subsys_state *css) { - kfree(cgroup_css(cgrp, net_prio_subsys_id)); + kfree(css); } static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) @@ -221,12 +220,13 @@ static int update_netprio(const void *v, struct file *file, unsigned n) return 0; } -static void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *p; void *v; - cgroup_taskset_for_each(p, cgrp, tset) { + cgroup_taskset_for_each(p, css->cgroup, tset) { task_lock(p); v = (void *)(unsigned long)task_netprioidx(p); iterate_fd(p->files, 0, update_netprio, v); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 9e6b75e..dc39838 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -38,7 +38,8 @@ static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) return css_cls_state(task_css(p, net_cls_subsys_id)); } -static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_cls_state *cs; @@ -48,19 +49,19 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) return &cs->css; } -static int cgrp_css_online(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup_subsys_state *css) { - struct cgroup_cls_state *cs = cgrp_cls_state(cgrp); - struct cgroup_cls_state *parent = css_cls_state(css_parent(&cs->css)); + struct cgroup_cls_state *cs = css_cls_state(css); + struct cgroup_cls_state *parent = css_cls_state(css_parent(css)); if (parent) cs->classid = parent->classid; return 0; } -static void cgrp_css_free(struct cgroup *cgrp) +static void cgrp_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp_cls_state(cgrp)); + kfree(css_cls_state(css)); } static int update_classid(const void *v, struct file *file, unsigned n) @@ -72,12 +73,13 @@ static int update_classid(const void *v, struct file *file, unsigned n) return 0; } -static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void cgrp_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *p; void *v; - cgroup_taskset_for_each(p, cgrp, tset) { + cgroup_taskset_for_each(p, css->cgroup, tset) { task_lock(p); v = (void *)(unsigned long)task_cls_classid(p); iterate_fd(p->files, 0, update_classid, v); diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 635a49d..7293ac4 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -68,7 +68,7 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) struct cgroup_subsys devices_subsys; -static int devcgroup_can_attach(struct cgroup *new_cgrp, +static int devcgroup_can_attach(struct cgroup_subsys_state *new_css, struct cgroup_taskset *set) { struct task_struct *task = cgroup_taskset_first(set); @@ -193,13 +193,13 @@ static inline bool is_devcg_online(const struct dev_cgroup *devcg) /** * devcgroup_online - initializes devcgroup's behavior and exceptions based on * parent's - * @cgroup: cgroup getting online + * @css: css getting online * returns 0 in case of success, error code otherwise */ -static int devcgroup_online(struct cgroup *cgroup) +static int devcgroup_online(struct cgroup_subsys_state *css) { - struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup); - struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css_parent(&dev_cgroup->css)); + struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); + struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css_parent(css)); int ret = 0; mutex_lock(&devcgroup_mutex); @@ -217,9 +217,9 @@ static int devcgroup_online(struct cgroup *cgroup) return ret; } -static void devcgroup_offline(struct cgroup *cgroup) +static void devcgroup_offline(struct cgroup_subsys_state *css) { - struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup); + struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); mutex_lock(&devcgroup_mutex); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; @@ -229,7 +229,8 @@ static void devcgroup_offline(struct cgroup *cgroup) /* * called from kernel/cgroup.c with cgroup_lock() held. */ -static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +devcgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct dev_cgroup *dev_cgroup; @@ -242,11 +243,10 @@ static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup) return &dev_cgroup->css; } -static void devcgroup_css_free(struct cgroup *cgroup) +static void devcgroup_css_free(struct cgroup_subsys_state *css) { - struct dev_cgroup *dev_cgroup; + struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); - dev_cgroup = cgroup_to_devcgroup(cgroup); __dev_exception_clean(dev_cgroup); kfree(dev_cgroup); } -- cgit v0.10.2 From 2bb566cb68dfafad328af666ebadf0e49accd6ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add subsys backlink pointer to cftype cgroup is transitioning to using css (cgroup_subsys_state) instead of cgroup as the primary subsystem handle. The cgroupfs file interface will be converted to use css's which requires finding out the subsystem from cftype so that the matching css can be determined from the cgroup. This patch adds cftype->ss which points to the subsystem the file belongs to. The field is initialized while a cftype is being registered. This makes it unnecessary to explicitly specify the subsystem for other cftype handling functions. @ss argument dropped from various cftype handling functions. This patch shouldn't introduce any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Vivek Goyal Cc: Jens Axboe diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 79fd9f4..3406373 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1128,7 +1128,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* kill the intf files first */ if (pol->cftypes) - cgroup_rm_cftypes(&blkio_subsys, pol->cftypes); + cgroup_rm_cftypes(pol->cftypes); /* unregister and update blkgs */ blkcg_policy[pol->plid] = NULL; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9c2b9dd..5db8138 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -429,6 +429,12 @@ struct cftype { /* CFTYPE_* flags */ unsigned int flags; + /* + * The subsys this file belongs to. Initialized automatically + * during registration. NULL for cgroup core files. + */ + struct cgroup_subsys *ss; + int (*open)(struct inode *inode, struct file *file); ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, struct file *file, @@ -542,7 +548,7 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) } int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); -int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_rm_cftypes(struct cftype *cfts); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 271d9a5..c4bc8da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -219,8 +219,8 @@ static struct cftype cgroup_base_files[]; static void cgroup_offline_fn(struct work_struct *work); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype cfts[], bool is_add); +static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], + bool is_add); /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) @@ -974,7 +974,7 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) if (!test_bit(i, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, NULL, set->cfts, false); + cgroup_addrm_files(cgrp, set->cfts, false); } } @@ -1623,7 +1623,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cred = override_creds(&init_cred); - ret = cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); + ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); if (ret) goto rm_base_files; @@ -1681,7 +1681,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, rm_base_files: free_cgrp_cset_links(&tmp_links); - cgroup_addrm_files(&root->top_cgroup, NULL, cgroup_base_files, false); + cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); revert_creds(cred); unlock_drop: cgroup_exit_root_id(root); @@ -2694,8 +2694,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype *cft) +static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { struct dentry *dir = cgrp->dentry; struct cgroup *parent = __d_cgrp(dir); @@ -2705,8 +2704,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { - strcpy(name, subsys->name); + if (cft->ss && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { + strcpy(name, cft->ss->name); strcat(name, "."); } strcat(name, cft->name); @@ -2743,17 +2742,16 @@ out: /** * cgroup_addrm_files - add or remove files to a cgroup directory * @cgrp: the target cgroup - * @subsys: the subsystem of files to be added * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. - * All @cfts should belong to @subsys. For removals, this function never - * fails. If addition fails, this function doesn't remove files already - * added. The caller is responsible for cleaning up. + * For removals, this function never fails. If addition fails, this + * function doesn't remove files already added. The caller is responsible + * for cleaning up. */ -static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype cfts[], bool is_add) +static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], + bool is_add) { struct cftype *cft; int ret; @@ -2771,7 +2769,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, continue; if (is_add) { - ret = cgroup_add_file(cgrp, subsys, cft); + ret = cgroup_add_file(cgrp, cft); if (ret) { pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", cft->name, ret); @@ -2796,11 +2794,11 @@ static void cgroup_cfts_prepare(void) mutex_lock(&cgroup_mutex); } -static int cgroup_cfts_commit(struct cgroup_subsys *ss, - struct cftype *cfts, bool is_add) +static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) __releases(&cgroup_mutex) { LIST_HEAD(pending); + struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *cgrp, *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; struct dentry *prev = NULL; @@ -2828,7 +2826,7 @@ static int cgroup_cfts_commit(struct cgroup_subsys *ss, inode = root->dentry->d_inode; mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - ret = cgroup_addrm_files(root, ss, cfts, is_add); + ret = cgroup_addrm_files(root, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -2851,7 +2849,7 @@ static int cgroup_cfts_commit(struct cgroup_subsys *ss, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) - ret = cgroup_addrm_files(cgrp, ss, cfts, is_add); + ret = cgroup_addrm_files(cgrp, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -2883,51 +2881,56 @@ out_deact: int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; + struct cftype *cft; int ret; set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; + for (cft = cfts; cft->name[0] != '\0'; cft++) + cft->ss = ss; + cgroup_cfts_prepare(); set->cfts = cfts; list_add_tail(&set->node, &ss->cftsets); - ret = cgroup_cfts_commit(ss, cfts, true); + ret = cgroup_cfts_commit(cfts, true); if (ret) - cgroup_rm_cftypes(ss, cfts); + cgroup_rm_cftypes(cfts); return ret; } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem - * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * - * Unregister @cfts from @ss. Files described by @cfts are removed from - * all existing cgroups to which @ss is attached and all future cgroups - * won't have them either. This function can be called anytime whether @ss - * is attached or not. + * Unregister @cfts. Files described by @cfts are removed from all + * existing cgroups and all future cgroups won't have them either. This + * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not - * registered with @ss. + * registered. */ -int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_rm_cftypes(struct cftype *cfts) { struct cftype_set *set; + if (!cfts || !cfts[0].ss) + return -ENOENT; + cgroup_cfts_prepare(); - list_for_each_entry(set, &ss->cftsets, node) { + list_for_each_entry(set, &cfts[0].ss->cftsets, node) { if (set->cfts == cfts) { list_del(&set->node); kfree(set); - cgroup_cfts_commit(ss, cfts, false); + cgroup_cfts_commit(cfts, false); return 0; } } - cgroup_cfts_commit(ss, NULL, false); + cgroup_cfts_commit(NULL, false); return -ENOENT; } @@ -4148,7 +4151,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) continue; list_for_each_entry(set, &ss->cftsets, node) { - ret = cgroup_addrm_files(cgrp, ss, set->cfts, true); + ret = cgroup_addrm_files(cgrp, set->cfts, true); if (ret < 0) goto err; } @@ -4377,7 +4380,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); + err = cgroup_addrm_files(cgrp, cgroup_base_files, true); if (err) goto err_destroy; @@ -4538,7 +4541,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * but we aren't quite done with @cgrp yet, so hold onto it. */ cgroup_clear_dir(cgrp, cgrp->root->subsys_mask); - cgroup_addrm_files(cgrp, NULL, cgroup_base_files, false); + cgroup_addrm_files(cgrp, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); @@ -4632,6 +4635,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) * deregistration. */ if (ss->base_cftypes) { + struct cftype *cft; + + for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) + cft->ss = ss; + ss->base_cftset.cfts = ss->base_cftypes; list_add_tail(&ss->base_cftset.node, &ss->cftsets); } -- cgit v0.10.2 From f7d58818ba4249f04a83b73aaac135640050bb4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: pin cgroup_subsys_state when opening a cgroupfs file Previously, each file read/write operation relied on the inode reference count pinning the cgroup and simply checked whether the cgroup was marked dead before proceeding to invoke the per-subsystem callback. This was rather silly as it didn't have any synchronization or css pinning around the check and the cgroup may be removed and all css refs drained between the DEAD check and actual method invocation. This patch pins the css between open() and release() so that it is guaranteed to be alive for all file operations and remove the silly DEAD checks from cgroup_file_read/write(). Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c4bc8da..583f8f6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2277,6 +2277,17 @@ static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, return 0; } +/* return the css for the given cgroup file */ +static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) +{ + struct cftype *cft = cfe->type; + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + + if (cft->ss) + return cgrp->subsys[cft->ss->subsys_id]; + return NULL; +} + /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 @@ -2354,8 +2365,6 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (cgroup_is_dead(cgrp)) - return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) @@ -2399,9 +2408,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (cgroup_is_dead(cgrp)) - return -ENODEV; - if (cft->read) return cft->read(cgrp, cft, file, buf, nbytes, ppos); if (cft->read_u64) @@ -2447,15 +2453,22 @@ static const struct file_operations cgroup_seqfile_operations = { static int cgroup_file_open(struct inode *inode, struct file *file) { + struct cfent *cfe = __d_cfe(file->f_dentry); + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); int err; - struct cfent *cfe; - struct cftype *cft; err = generic_file_open(inode, file); if (err) return err; - cfe = __d_cfe(file->f_dentry); - cft = cfe->type; + + /* + * If the file belongs to a subsystem, pin the css. Will be + * unpinned either on open failure or release. This ensures that + * @css stays alive for all file operations. + */ + if (css && !css_tryget(css)) + return -ENODEV; if (cft->read_map || cft->read_seq_string) { file->f_op = &cgroup_seqfile_operations; @@ -2464,15 +2477,23 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = cft->open(inode, file); } + if (css && err) + css_put(css); return err; } static int cgroup_file_release(struct inode *inode, struct file *file) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); + int ret = 0; + if (cft->release) - return cft->release(inode, file); - return 0; + ret = cft->release(inode, file); + if (css) + css_put(css); + return ret; } /* -- cgit v0.10.2 From 67f4c36f83455b253445b2cb28ac9a2c4f85d99a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: add cgroup->dummy_css cgroup subsystem API is being converted to use css (cgroup_subsys_state) as the main handle, which makes things a bit awkward for subsystem agnostic core features - the "cgroup.*" interface files and various iterations - a bit awkward as they don't have a css to use. This patch adds cgroup->dummy_css which has NULL ->ss and whose only role is pointing back to the cgroup. This will be used to support subsystem agnostic features on the coming css based API. css_parent() is updated to handle dummy_css's. Note that css will soon grow its own ->parent field and css_parent() will be made trivial. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5db8138..b0d5f53 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -225,6 +225,9 @@ struct cgroup { struct list_head pidlists; struct mutex pidlist_mutex; + /* dummy css with NULL ->ss, points back to this cgroup */ + struct cgroup_subsys_state dummy_css; + /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; @@ -668,7 +671,13 @@ struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) { struct cgroup *parent_cgrp = css->cgroup->parent; - return parent_cgrp ? parent_cgrp->subsys[css->ss->subsys_id] : NULL; + if (!parent_cgrp) + return NULL; + + if (css->ss) + return parent_cgrp->subsys[css->ss->subsys_id]; + else + return &parent_cgrp->dummy_css; } /** diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 583f8f6..c049992 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1365,6 +1365,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); + cgrp->dummy_css.cgroup = cgrp; INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); @@ -2285,7 +2286,7 @@ static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) if (cft->ss) return cgrp->subsys[cft->ss->subsys_id]; - return NULL; + return &cgrp->dummy_css; } /* A buffer size big enough for numbers or short strings */ @@ -2467,7 +2468,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) * unpinned either on open failure or release. This ensures that * @css stays alive for all file operations. */ - if (css && !css_tryget(css)) + if (css->ss && !css_tryget(css)) return -ENODEV; if (cft->read_map || cft->read_seq_string) { @@ -2477,7 +2478,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = cft->open(inode, file); } - if (css && err) + if (css->ss && err) css_put(css); return err; } @@ -2491,7 +2492,7 @@ static int cgroup_file_release(struct inode *inode, struct file *file) if (cft->release) ret = cft->release(inode, file); - if (css) + if (css->ss) css_put(css); return ret; } -- cgit v0.10.2 From 182446d087906de40e514573a92a97b203695f71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: pass around cgroup_subsys_state instead of cgroup in file methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup. Please see the previous commit which converts the subsystem methods for rationale. This patch converts all cftype file operations to take @css instead of @cgroup. cftypes for the cgroup core files don't have their subsytem pointer set. These will automatically use the dummy_css added by the previous patch and can be converted the same way. Most subsystem conversions are straight forwards but there are some interesting ones. * freezer: update_if_frozen() is also converted to take @css instead of @cgroup for consistency. This will make the code look simpler too once iterators are converted to use css. * memory/vmpressure: mem_cgroup_from_css() needs to be exported to vmpressure while mem_cgroup_from_cont() can be made static. Updated accordingly. * cpu: cgroup_tg() doesn't have any user left. Removed. * cpuacct: cgroup_ca() doesn't have any user left. Removed. * hugetlb: hugetlb_cgroup_form_cgroup() doesn't have any user left. Removed. * net_cls: cgrp_cls_state() doesn't have any user left. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Daniel Wagner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe Cc: Steven Rostedt diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3406373..f46f3c6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -437,10 +437,10 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, return &blkg->rl; } -static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, - u64 val) +static int blkcg_reset_stats(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 val) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; int i; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 08a32df..88bcfb6 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1293,10 +1293,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &rwstat); } -static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, cft->private, true); @@ -1325,26 +1325,26 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, return __blkg_prfill_u64(sf, pd, v); } -static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_conf_u64(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, + blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64, &blkcg_policy_throtl, cft->private, false); return 0; } -static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_conf_uint(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, + blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint, &blkcg_policy_throtl, cft->private, false); return 0; } -static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, - bool is_u64) +static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, + const char *buf, bool is_u64) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkg_conf_ctx ctx; struct throtl_grp *tg; struct throtl_service_queue *sq; @@ -1403,16 +1403,16 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, return 0; } -static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, +static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft, const char *buf) { - return tg_set_conf(cgrp, cft, buf, true); + return tg_set_conf(css, cft, buf, true); } -static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, +static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft, const char *buf) { - return tg_set_conf(cgrp, cft, buf, false); + return tg_set_conf(css, cft, buf, false); } static struct cftype throtl_files[] = { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d5bbdcf..dabb9d0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1607,12 +1607,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); } -static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), - cfqg_prfill_weight_device, &blkcg_policy_cfq, 0, - false); + blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); return 0; } @@ -1626,35 +1625,34 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); } -static int cfqg_print_leaf_weight_device(struct cgroup *cgrp, +static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), - cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0, - false); + blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device, + &blkcg_policy_cfq, 0, false); return 0; } -static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, +static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight); + seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight); return 0; } -static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfq_print_leaf_weight(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - seq_printf(sf, "%u\n", - cgroup_to_blkcg(cgrp)->cfq_leaf_weight); + seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight); return 0; } -static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf, bool is_leaf_weight) +static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf, + bool is_leaf_weight) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; int ret; @@ -1680,22 +1678,22 @@ static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, return ret; } -static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cfqg_set_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - return __cfqg_set_weight_device(cgrp, cft, buf, false); + return __cfqg_set_weight_device(css, cft, buf, false); } -static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - return __cfqg_set_weight_device(cgrp, cft, buf, true); + return __cfqg_set_weight_device(css, cft, buf, true); } -static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, - bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val, bool is_leaf_weight) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) @@ -1727,30 +1725,32 @@ static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, return 0; } -static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) { - return __cfq_set_weight(cgrp, cft, val, false); + return __cfq_set_weight(css, cft, val, false); } -static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - return __cfq_set_weight(cgrp, cft, val, true); + return __cfq_set_weight(css, cft, val, true); } -static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, +static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, cft->private, false); return 0; } -static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_rwstat(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, cft->private, true); @@ -1773,20 +1773,20 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &sum); } -static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, &blkcg_policy_cfq, cft->private, false); return 0; } -static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, cft->private, true); @@ -1810,10 +1810,10 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, } /* print avg_queue_size */ -static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, 0, false); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b0d5f53..0b91436 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -439,34 +439,34 @@ struct cftype { struct cgroup_subsys *ss; int (*open)(struct inode *inode, struct file *file); - ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, + ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft, struct file *file, char __user *buf, size_t nbytes, loff_t *ppos); /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() */ - u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft); + u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); /* * read_s64() is a signed version of read_u64() */ - s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft); + s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); /* * read_map() is used for defining a map of key/value * pairs. It should call cb->fill(cb, key, value) for each * entry. The key/value pairs (and their ordering) should not * change between reboots. */ - int (*read_map)(struct cgroup *cgrp, struct cftype *cft, + int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb); /* * read_seq_string() is used for outputting a simple sequence * using seqfile. */ - int (*read_seq_string)(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *m); + int (*read_seq_string)(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m); - ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft, + ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft, struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos); @@ -475,18 +475,20 @@ struct cftype { * a single integer (as parsed by simple_strtoull) from * userspace. Use in place of write(); return 0 or error. */ - int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val); + int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val); /* * write_s64() is a signed version of write_u64() */ - int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val); + int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val); /* * write_string() is passed a nul-terminated kernelspace * buffer of maximum length determined by max_write_len. * Returns 0 or -ve error code. */ - int (*write_string)(struct cgroup *cgrp, struct cftype *cft, + int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer); /* * trigger() callback can be used to get some kick from the @@ -494,7 +496,7 @@ struct cftype { * at all. The private field can be used to determine the * kick type for multiplexing. */ - int (*trigger)(struct cgroup *cgrp, unsigned int event); + int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); int (*release)(struct inode *inode, struct file *file); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7b4d9d7..6c41609 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -85,7 +85,7 @@ extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm); extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); -extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont); +extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css); static inline bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c049992..6ee4698 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2235,34 +2235,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +static int cgroup_tasks_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 pid) { - return attach_task_by_pid(cgrp, pid, false); + return attach_task_by_pid(css->cgroup, pid, false); } -static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) +static int cgroup_procs_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 tgid) { - return attach_task_by_pid(cgrp, tgid, true); + return attach_task_by_pid(css->cgroup, tgid, true); } -static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int cgroup_release_agent_write(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); if (strlen(buffer) >= PATH_MAX) return -EINVAL; - if (!cgroup_lock_live_group(cgrp)) + if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; mutex_lock(&cgroup_root_mutex); - strcpy(cgrp->root->release_agent_path, buffer); + strcpy(css->cgroup->root->release_agent_path, buffer); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; } -static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) +static int cgroup_release_agent_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { + struct cgroup *cgrp = css->cgroup; + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; seq_puts(seq, cgrp->root->release_agent_path); @@ -2271,10 +2275,10 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return 0; } -static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) +static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { - seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); + seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); return 0; } @@ -2292,10 +2296,10 @@ static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 -static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + const char __user *userbuf, size_t nbytes, + loff_t *unused_ppos) { char buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; @@ -2313,22 +2317,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, u64 val = simple_strtoull(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - retval = cft->write_u64(cgrp, cft, val); + retval = cft->write_u64(css, cft, val); } else { s64 val = simple_strtoll(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - retval = cft->write_s64(cgrp, cft, val); + retval = cft->write_s64(css, cft, val); } if (!retval) retval = nbytes; return retval; } -static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + const char __user *userbuf, size_t nbytes, + loff_t *unused_ppos) { char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; @@ -2351,7 +2355,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, } buffer[nbytes] = 0; /* nul-terminate */ - retval = cft->write_string(cgrp, cft, strstrip(buffer)); + retval = cft->write_string(css, cft, strstrip(buffer)); if (!retval) retval = nbytes; out: @@ -2361,60 +2365,60 @@ out: } static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); if (cft->write) - return cft->write(cgrp, cft, file, buf, nbytes, ppos); + return cft->write(css, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); if (cft->write_string) - return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_write_string(css, cft, file, buf, nbytes, ppos); if (cft->trigger) { - int ret = cft->trigger(cgrp, (unsigned int)cft->private); + int ret = cft->trigger(css, (unsigned int)cft->private); return ret ? ret : nbytes; } return -EINVAL; } -static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(cgrp, cft); + u64 val = cft->read_u64(css, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } -static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(cgrp, cft); + s64 val = cft->read_s64(css, cft); int len = sprintf(tmp, "%lld\n", (long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); if (cft->read) - return cft->read(cgrp, cft, file, buf, nbytes, ppos); + return cft->read(css, cft, file, buf, nbytes, ppos); if (cft->read_u64) - return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); if (cft->read_s64) - return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); return -EINVAL; } @@ -2433,16 +2437,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cfent *cfe = m->private; struct cftype *cft = cfe->type; - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); if (cft->read_map) { struct cgroup_map_cb cb = { .fill = cgroup_map_add, .state = m, }; - return cft->read_map(cgrp, cft, &cb); + return cft->read_map(css, cft, &cb); } - return cft->read_seq_string(cgrp, cft, m); + return cft->read_seq_string(css, cft, m); } static const struct file_operations cgroup_seqfile_operations = { @@ -3860,21 +3864,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file) return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); } -static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, - struct cftype *cft) +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) { - return notify_on_release(cgrp); + return notify_on_release(css->cgroup); } -static int cgroup_write_notify_on_release(struct cgroup *cgrp, - struct cftype *cft, - u64 val) +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - clear_bit(CGRP_RELEASABLE, &cgrp->flags); + clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); return 0; } @@ -3972,9 +3975,10 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format ' '. * Interpretation of args is defined by control file implementation. */ -static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int cgroup_write_event_control(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { + struct cgroup *cgrp = css->cgroup; struct cgroup_event *event; struct cgroup *cgrp_cfile; unsigned int efd, cfd; @@ -4082,20 +4086,19 @@ out_kfree: return ret; } -static u64 cgroup_clone_children_read(struct cgroup *cgrp, - struct cftype *cft) +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); } -static int cgroup_clone_children_write(struct cgroup *cgrp, - struct cftype *cft, - u64 val) +static int cgroup_clone_children_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); return 0; } @@ -5585,17 +5588,19 @@ static void debug_css_free(struct cgroup_subsys_state *css) kfree(css); } -static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return cgroup_task_count(cgrp); + return cgroup_task_count(css->cgroup); } -static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) { return (u64)(unsigned long)current->cgroups; } -static u64 current_css_set_refcount_read(struct cgroup *cgrp, +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, struct cftype *cft) { u64 count; @@ -5606,7 +5611,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp, return count; } -static int current_css_set_cg_links_read(struct cgroup *cgrp, +static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *seq) { @@ -5633,14 +5638,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp, } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup *cgrp, - struct cftype *cft, - struct seq_file *seq) +static int cgroup_css_links_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) { + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; @@ -5659,9 +5663,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp, return 0; } -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return test_bit(CGRP_RELEASABLE, &cgrp->flags); + return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); } static struct cftype debug_files[] = { diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f03a857..19613ba 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -245,7 +245,7 @@ out: /** * update_if_frozen - update whether a cgroup finished freezing - * @cgroup: cgroup of interest + * @css: css of interest * * Once FREEZING is initiated, transition to FROZEN is lazily updated by * calling this function. If the current state is FREEZING but not FROZEN, @@ -256,12 +256,12 @@ out: * update_if_frozen() on all descendants prior to invoking this function. * * Task states and freezer state might disagree while tasks are being - * migrated into or out of @cgroup, so we can't verify task states against + * migrated into or out of @css, so we can't verify task states against * @freezer state here. See freezer_attach() for details. */ -static void update_if_frozen(struct cgroup *cgroup) +static void update_if_frozen(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); struct cgroup *pos; struct cgroup_iter it; struct task_struct *task; @@ -275,7 +275,7 @@ static void update_if_frozen(struct cgroup *cgroup) goto out_unlock; /* are all (live) children frozen? */ - cgroup_for_each_child(pos, cgroup) { + cgroup_for_each_child(pos, css->cgroup) { struct freezer *child = cgroup_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && @@ -284,9 +284,9 @@ static void update_if_frozen(struct cgroup *cgroup) } /* are all tasks frozen? */ - cgroup_iter_start(cgroup, &it); + cgroup_iter_start(css->cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { + while ((task = cgroup_iter_next(css->cgroup, &it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -301,12 +301,12 @@ static void update_if_frozen(struct cgroup *cgroup) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_iter_end(cgroup, &it); + cgroup_iter_end(css->cgroup, &it); out_unlock: spin_unlock_irq(&freezer->lock); } -static int freezer_read(struct cgroup *cgroup, struct cftype *cft, +static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { struct cgroup *pos; @@ -314,13 +314,13 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, rcu_read_lock(); /* update states bottom-up */ - cgroup_for_each_descendant_post(pos, cgroup) - update_if_frozen(pos); - update_if_frozen(cgroup); + cgroup_for_each_descendant_post(pos, css->cgroup) + update_if_frozen(cgroup_css(pos, freezer_subsys_id)); + update_if_frozen(css); rcu_read_unlock(); - seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); + seq_puts(m, freezer_state_strs(css_freezer(css)->state)); seq_putc(m, '\n'); return 0; } @@ -426,7 +426,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) rcu_read_unlock(); } -static int freezer_write(struct cgroup *cgroup, struct cftype *cft, +static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { bool freeze; @@ -438,20 +438,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, else return -EINVAL; - freezer_change_state(cgroup_freezer(cgroup), freeze); + freezer_change_state(css_freezer(css), freeze); return 0; } -static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_SELF); } -static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_PARENT); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8ce3fdc..89b76e1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1603,9 +1603,10 @@ typedef enum { FILE_SPREAD_SLAB, } cpuset_filetype_t; -static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; int retval = -ENODEV; @@ -1650,9 +1651,10 @@ out_unlock: return retval; } -static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) +static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; int retval = -ENODEV; @@ -1676,10 +1678,10 @@ out_unlock: /* * Common handling for a write to a "cpus" or "mems" file. */ -static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cpuset_write_resmask(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *trialcs; int retval = -ENODEV; @@ -1758,13 +1760,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) return count; } -static ssize_t cpuset_common_file_read(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, - char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; char *page; ssize_t retval = 0; @@ -1794,9 +1795,9 @@ out: return retval; } -static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1825,9 +1826,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) return 0; } -static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) +static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 622b7ef..cc9a492 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7088,12 +7088,6 @@ static inline struct task_group *css_tg(struct cgroup_subsys_state *css) return css ? container_of(css, struct task_group, css) : NULL; } -/* return corresponding task_group object of a cgroup */ -static inline struct task_group *cgroup_tg(struct cgroup *cgrp) -{ - return css_tg(cgroup_css(cgrp, cpu_cgroup_subsys_id)); -} - static struct cgroup_subsys_state * cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -7179,15 +7173,16 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css, } #ifdef CONFIG_FAIR_GROUP_SCHED -static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 shareval) +static int cpu_shares_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 shareval) { - return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); + return sched_group_set_shares(css_tg(css), scale_load(shareval)); } -static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); return (u64) scale_load_down(tg->shares); } @@ -7309,26 +7304,28 @@ long tg_get_cfs_period(struct task_group *tg) return cfs_period_us; } -static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_quota(cgroup_tg(cgrp)); + return tg_get_cfs_quota(css_tg(css)); } -static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, - s64 cfs_quota_us) +static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 cfs_quota_us) { - return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); + return tg_set_cfs_quota(css_tg(css), cfs_quota_us); } -static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_period(cgroup_tg(cgrp)); + return tg_get_cfs_period(css_tg(css)); } -static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 cfs_period_us) +static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_period_us) { - return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); + return tg_set_cfs_period(css_tg(css), cfs_period_us); } struct cfs_schedulable_data { @@ -7409,10 +7406,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, +static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; cb->fill(cb, "nr_periods", cfs_b->nr_periods); @@ -7425,26 +7422,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, - s64 val) +static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, + struct cftype *cft, s64 val) { - return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); + return sched_group_set_rt_runtime(css_tg(css), val); } -static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return sched_group_rt_runtime(cgroup_tg(cgrp)); + return sched_group_rt_runtime(css_tg(css)); } -static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_period_us) +static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 rt_period_us) { - return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); + return sched_group_set_rt_period(css_tg(css), rt_period_us); } -static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, + struct cftype *cft) { - return sched_group_rt_period(cgroup_tg(cgrp)); + return sched_group_rt_period(css_tg(css)); } #endif /* CONFIG_RT_GROUP_SCHED */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 1b784d9..f64722f 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -38,12 +38,6 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) return css ? container_of(css, struct cpuacct, css) : NULL; } -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return css_ca(cgroup_css(cgrp, cpuacct_subsys_id)); -} - /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { @@ -138,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; int i; @@ -150,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) return totalcpuusage; } -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) +static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 reset) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); int err = 0; int i; @@ -169,10 +163,10 @@ out: return err; } -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) +static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { - struct cpuacct *ca = cgroup_ca(cgroup); + struct cpuacct *ca = css_ca(css); u64 percpu; int i; @@ -189,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) +static int cpuacct_stats_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct cgroup_map_cb *cb) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); int cpu; s64 val = 0; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index e213243..bda8e44 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -40,12 +40,6 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) } static inline -struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup) -{ - return hugetlb_cgroup_from_css(cgroup_css(cgroup, hugetlb_subsys_id)); -} - -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id)); @@ -248,14 +242,15 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, return; } -static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, - struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { u64 val; char str[64]; int idx, name, len; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); idx = MEMFILE_IDX(cft->private); name = MEMFILE_ATTR(cft->private); @@ -265,12 +260,12 @@ static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, str, len); } -static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, - const char *buffer) +static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { int idx, name, ret; unsigned long long val; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); idx = MEMFILE_IDX(cft->private); name = MEMFILE_ATTR(cft->private); @@ -295,10 +290,11 @@ static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, return ret; } -static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event) +static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, + unsigned int event) { int idx, name, ret = 0; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); idx = MEMFILE_IDX(event); name = MEMFILE_ATTR(event); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 32cca0f..ab64dfc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -483,7 +483,6 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) { return s ? container_of(s, struct mem_cgroup, css) : NULL; @@ -1035,7 +1034,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); } -struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) +static inline struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { return mem_cgroup_from_css(cgroup_css(cont, mem_cgroup_subsys_id)); } @@ -2951,10 +2950,10 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) } #ifdef CONFIG_SLABINFO -static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, - struct seq_file *m) +static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct memcg_cache_params *params; if (!memcg_can_account_kmem(memcg)) @@ -4999,9 +4998,10 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return 0; } -static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) +static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, + unsigned int event) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); int ret; if (mem_cgroup_is_root(memcg)) @@ -5014,16 +5014,17 @@ static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) } -static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) +static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return mem_cgroup_from_cont(cont)->use_hierarchy; + return mem_cgroup_from_css(css)->use_hierarchy; } -static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, - u64 val) +static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { int retval = 0; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); mutex_lock(&memcg_create_mutex); @@ -5094,11 +5095,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return val << PAGE_SHIFT; } -static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, - struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); char str[64]; u64 val; int name, len; @@ -5131,11 +5132,11 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, str, len); } -static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) +static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) { int ret = -EINVAL; #ifdef CONFIG_MEMCG_KMEM - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* * For simplicity, we won't allow this to be disabled. It also can't * be changed if the cgroup has children already, or if tasks had @@ -5151,7 +5152,7 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) mutex_lock(&memcg_create_mutex); mutex_lock(&set_limit_mutex); if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { - if (cgroup_task_count(cont) || memcg_has_children(memcg)) { + if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { ret = -EBUSY; goto out; } @@ -5221,10 +5222,10 @@ out: * The user of this function is... * RES_LIMIT. */ -static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, +static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); enum res_type type; int name; unsigned long long val; @@ -5248,7 +5249,7 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); else if (type == _KMEM) - ret = memcg_update_kmem_limit(cont, val); + ret = memcg_update_kmem_limit(css, val); else return -EINVAL; break; @@ -5297,9 +5298,9 @@ out: *memsw_limit = min_memsw_limit; } -static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) +static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); int name; enum res_type type; @@ -5332,17 +5333,17 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) return 0; } -static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, +static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; + return mem_cgroup_from_css(css)->move_charge_at_immigrate; } #ifdef CONFIG_MMU -static int mem_cgroup_move_charge_write(struct cgroup *cgrp, +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); if (val >= (1 << NR_MOVE_TYPE)) return -EINVAL; @@ -5357,7 +5358,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, return 0; } #else -static int mem_cgroup_move_charge_write(struct cgroup *cgrp, +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { return -ENOSYS; @@ -5365,13 +5366,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, #endif #ifdef CONFIG_NUMA -static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, - struct seq_file *m) +static int memcg_numa_stat_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { int nid; unsigned long total_nr, file_nr, anon_nr, unevictable_nr; unsigned long node_nr; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); seq_printf(m, "total=%lu", total_nr); @@ -5416,10 +5417,10 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); } -static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, +static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *mi; unsigned int i; @@ -5503,17 +5504,18 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, return 0; } -static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) +static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); return mem_cgroup_swappiness(memcg); } -static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, - u64 val) +static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); if (val > 100 || !parent) @@ -5829,10 +5831,10 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, spin_unlock(&memcg_oom_lock); } -static int mem_cgroup_oom_control_read(struct cgroup *cgrp, +static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); @@ -5843,10 +5845,10 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, return 0; } -static int mem_cgroup_oom_control_write(struct cgroup *cgrp, +static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 7f1654d..2a8a736 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -81,8 +81,8 @@ static struct vmpressure *cg_to_vmpressure(struct cgroup *cg) static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { - struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cg); + struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); memcg = parent_mem_cgroup(memcg); if (!memcg) diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 8d095b4..e00f60e 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -168,15 +168,14 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) kfree(css); } -static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) +static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { - return cgrp->id; + return css->cgroup->id; } -static int read_priomap(struct cgroup *cont, struct cftype *cft, +static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { - struct cgroup_subsys_state *css = cgroup_css(cont, net_prio_subsys_id); struct net_device *dev; rcu_read_lock(); @@ -186,10 +185,9 @@ static int read_priomap(struct cgroup *cont, struct cftype *cft, return 0; } -static int write_priomap(struct cgroup *cgrp, struct cftype *cft, +static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, net_prio_subsys_id); char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index da14436..8a57d79 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -132,10 +132,10 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) return 0; } -static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, +static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); unsigned long long val; int ret = 0; @@ -180,9 +180,9 @@ static u64 tcp_read_usage(struct mem_cgroup *memcg) return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); } -static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) +static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); u64 val; switch (cft->private) { @@ -202,13 +202,13 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) return val; } -static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) +static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) { struct mem_cgroup *memcg; struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; - memcg = mem_cgroup_from_cont(cont); + memcg = mem_cgroup_from_css(css); cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return 0; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index dc39838..8ea1184 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -28,11 +28,6 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state return css ? container_of(css, struct cgroup_cls_state, css) : NULL; } -static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) -{ - return css_cls_state(cgroup_css(cgrp, net_cls_subsys_id)); -} - static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) { return css_cls_state(task_css(p, net_cls_subsys_id)); @@ -87,14 +82,15 @@ static void cgrp_attach(struct cgroup_subsys_state *css, } } -static u64 read_classid(struct cgroup *cgrp, struct cftype *cft) +static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) { - return cgrp_cls_state(cgrp)->classid; + return css_cls_state(css)->classid; } -static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value) +static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, + u64 value) { - cgrp_cls_state(cgrp)->classid = (u32) value; + css_cls_state(css)->classid = (u32) value; return 0; } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 7293ac4..e0ca464 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -289,10 +289,10 @@ static void set_majmin(char *str, unsigned m) sprintf(str, "%u", m); } -static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) +static int devcgroup_seq_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { - struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); + struct dev_cgroup *devcgroup = css_to_devcgroup(css); struct dev_exception_item *ex; char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; @@ -669,13 +669,13 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, return rc; } -static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int devcgroup_access_write(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { int retval; mutex_lock(&devcgroup_mutex); - retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp), + retval = devcgroup_update_access(css_to_devcgroup(css), cft->private, buffer); mutex_unlock(&devcgroup_mutex); return retval; -- cgit v0.10.2 From 3b287a505ef4024634beb12a93773254909d5dae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: convert cgroup_next_sibling() to cgroup_next_child() cgroup is transitioning to using css (cgroup_subsys_state) as the main subsys interface handle instead of cgroup and the iterators will be updated to use css too. The iterators need to walk the cgroup hierarchy and return the css's matching the origin css, which is a bit cumbersome to open code. This patch converts cgroup_next_sibling() to cgroup_next_child() so that it can handle all steps of direct child iteration. This will be used to update iterators to take @css instead of @cgrp. In addition to the new iteration init handling, cgroup_next_child() is restructured so that the different branches share the end of iteration condition check. This patch doesn't change any behavior. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0b91436..5f9ba58 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -779,7 +779,7 @@ static inline struct cgroup *cgroup_from_id(struct cgroup_subsys *ss, int id) return idr_find(&ss->root->cgroup_idr, id); } -struct cgroup *cgroup_next_sibling(struct cgroup *pos); +struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp); /** * cgroup_for_each_child - iterate through children of a cgroup @@ -802,7 +802,7 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos); #define cgroup_for_each_child(pos, cgrp) \ for ((pos) = list_first_or_null_rcu(&(cgrp)->children, \ struct cgroup, sibling); \ - (pos); (pos) = cgroup_next_sibling((pos))) + (pos); (pos) = cgroup_next_child((pos), (cgrp))) struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, struct cgroup *cgroup); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6ee4698..dd55244 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3037,15 +3037,16 @@ static void cgroup_enable_task_cg_lists(void) } /** - * cgroup_next_sibling - find the next sibling of a given cgroup - * @pos: the current cgroup + * cgroup_next_child - find the next child of a given cgroup + * @pos: the current position (%NULL to initiate traversal) + * @cgrp: cgroup whose descendants to walk * - * This function returns the next sibling of @pos and should be called - * under RCU read lock. The only requirement is that @pos is accessible. - * The next sibling is guaranteed to be returned regardless of @pos's - * state. + * This function returns the next child of @cgrp and should be called under + * RCU read lock. The only requirement is that @cgrp and @pos are + * accessible. The next sibling is guaranteed to be returned regardless of + * their states. */ -struct cgroup *cgroup_next_sibling(struct cgroup *pos) +struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp) { struct cgroup *next; @@ -3061,30 +3062,30 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) * safe to dereference from this RCU critical section. If * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed * to be visible as %true here. + * + * If @pos is dead, its next pointer can't be dereferenced; + * however, as each cgroup is given a monotonically increasing + * unique serial number and always appended to the sibling list, + * the next one can be found by walking the parent's children until + * we see a cgroup with higher serial number than @pos's. While + * this path can be slower, it's taken only when either the current + * cgroup is removed or iteration and removal race. */ - if (likely(!cgroup_is_dead(pos))) { + if (!pos) { + next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); + } else if (likely(!cgroup_is_dead(pos))) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); - if (&next->sibling != &pos->parent->children) - return next; - return NULL; + } else { + list_for_each_entry_rcu(next, &cgrp->children, sibling) + if (next->serial_nr > pos->serial_nr) + break; } - /* - * Can't dereference the next pointer. Each cgroup is given a - * monotonically increasing unique serial number and always - * appended to the sibling list, so the next one can be found by - * walking the parent's children until we see a cgroup with higher - * serial number than @pos's. - * - * While this path can be slow, it's taken only when either the - * current cgroup is removed or iteration and removal race. - */ - list_for_each_entry_rcu(next, &pos->parent->children, sibling) - if (next->serial_nr > pos->serial_nr) - return next; + if (&next->sibling != &cgrp->children) + return next; return NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_sibling); +EXPORT_SYMBOL_GPL(cgroup_next_child); /** * cgroup_next_descendant_pre - find the next descendant for pre-order walk @@ -3117,7 +3118,7 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != cgroup) { - next = cgroup_next_sibling(pos); + next = cgroup_next_child(pos, pos->parent); if (next) return next; pos = pos->parent; @@ -3198,7 +3199,7 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, } /* if there's an unvisited sibling, visit its leftmost descendant */ - next = cgroup_next_sibling(pos); + next = cgroup_next_child(pos, pos->parent); if (next) return cgroup_leftmost_descendant(next); @@ -4549,9 +4550,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to + * CGRP_DEAD assertion is depended upon by cgroup_next_child() to * resume iteration after dropping RCU read lock. See - * cgroup_next_sibling() for details. + * cgroup_next_child() for details. */ set_bit(CGRP_DEAD, &cgrp->flags); -- cgit v0.10.2 From f48e3924dca268c677c4e338e5d91ad9e6fe6b9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: always use cgroup_next_child() to walk the children list There are several places where the children list is accessed directly. This patch converts those places to use cgroup_next_child(). This will help updating the hierarchy iterators to use @css instead of @cgrp. While cgroup_next_child() can be heavy in pathological cases - e.g. a lot of dead children, this shouldn't cause any noticeable behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5f9ba58..c288bce 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -800,9 +800,8 @@ struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp); * the start of the next iteration by, for example, bumping the css refcnt. */ #define cgroup_for_each_child(pos, cgrp) \ - for ((pos) = list_first_or_null_rcu(&(cgrp)->children, \ - struct cgroup, sibling); \ - (pos); (pos) = cgroup_next_child((pos), (cgrp))) + for ((pos) = cgroup_next_child(NULL, (cgrp)); (pos); \ + (pos) = cgroup_next_child((pos), (cgrp))) struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, struct cgroup *cgroup); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dd55244..2b7354f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3112,7 +3112,7 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, pos = cgroup; /* visit the first child if exists */ - next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); + next = cgroup_next_child(NULL, pos); if (next) return next; @@ -3151,7 +3151,7 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; - list_for_each_entry_rcu(tmp, &last->children, sibling) + cgroup_for_each_child(tmp, last) pos = tmp; } while (pos); @@ -3165,8 +3165,7 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) do { last = pos; - pos = list_first_or_null_rcu(&pos->children, struct cgroup, - sibling); + pos = cgroup_next_child(NULL, pos); } while (pos); return last; -- cgit v0.10.2 From 492eb21b98f88e411a8bb43d6edcd7d7022add10 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:25 -0400 Subject: cgroup: make hierarchy iterators deal with cgroup_subsys_state instead of cgroup cgroup is currently in the process of transitioning to using css (cgroup_subsys_state) as the primary handle instead of cgroup in subsystem API. For hierarchy iterators, this is beneficial because * In most cases, css is the only thing subsystems care about anyway. * On the planned unified hierarchy, iterations for different subsystems will need to skip over different subtrees of the hierarchy depending on which subsystems are enabled on each cgroup. Passing around css makes it unnecessary to explicitly specify the subsystem in question as css is intersection between cgroup and subsystem * For the planned unified hierarchy, css's would need to be created and destroyed dynamically independent from cgroup hierarchy. Having cgroup core manage css iteration makes enforcing deref rules a lot easier. Most subsystem conversions are straight-forward. Noteworthy changes are * blkio: cgroup_to_blkcg() is no longer used. Removed. * freezer: cgroup_freezer() is no longer used. Removed. * devices: cgroup_to_devcgroup() is no longer used. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f46f3c6..4b40640 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -614,7 +614,7 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) { struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; u64 sum; lockdep_assert_held(pd->blkg->q->queue_lock); @@ -622,7 +622,7 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) sum = blkg_stat_read((void *)pd + off); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); struct blkg_stat *stat = (void *)pos_pd + off; @@ -649,7 +649,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, { struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum; int i; @@ -658,7 +658,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, sum = blkg_rwstat_read((void *)pd + off); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); struct blkg_rwstat *rwstat = (void *)pos_pd + off; struct blkg_rwstat tmp; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b6802c4..8555386 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -184,11 +184,6 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) return css ? container_of(css, struct blkcg, css) : NULL; } -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) -{ - return css_to_blkcg(cgroup_css(cgroup, blkio_subsys_id)); -} - static inline struct blkcg *task_blkcg(struct task_struct *tsk) { return css_to_blkcg(task_css(tsk, blkio_subsys_id)); @@ -289,32 +284,31 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may - * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip - * subtree. + * update @pos_css by calling css_rightmost_descendant() to skip subtree. */ -#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ - cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ - if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. */ -#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \ - cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ - if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) /** @@ -577,7 +571,6 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 88bcfb6..8cefa7f 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1349,7 +1349,7 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, struct throtl_grp *tg; struct throtl_service_queue *sq; struct blkcg_gq *blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; int ret; ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); @@ -1380,7 +1380,7 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, * blk-throttle. */ tg_update_has_rules(tg); - blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg) + blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) tg_update_has_rules(blkg_to_tg(blkg)); /* @@ -1623,7 +1623,7 @@ void blk_throtl_drain(struct request_queue *q) { struct throtl_data *td = q->td; struct blkcg_gq *blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; struct bio *bio; int rw; @@ -1636,7 +1636,7 @@ void blk_throtl_drain(struct request_queue *q) * better to walk service_queue tree directly but blkg walk is * easier. */ - blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg) + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) tg_drain_bios(&blkg_to_tg(blkg)->service_queue); tg_drain_bios(&td_root_tg(td)->service_queue); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c288bce..4bc22f4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -779,68 +779,72 @@ static inline struct cgroup *cgroup_from_id(struct cgroup_subsys *ss, int id) return idr_find(&ss->root->cgroup_idr, id); } -struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp); +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent); /** - * cgroup_for_each_child - iterate through children of a cgroup - * @pos: the cgroup * to use as the loop cursor - * @cgrp: cgroup whose children to walk + * css_for_each_child - iterate through children of a css + * @pos: the css * to use as the loop cursor + * @parent: css whose children to walk * - * Walk @cgrp's children. Must be called under rcu_read_lock(). A child - * cgroup which hasn't finished ->css_online() or already has finished + * Walk @parent's children. Must be called under rcu_read_lock(). A child + * css which hasn't finished ->css_online() or already has finished * ->css_offline() may show up during traversal and it's each subsystem's * responsibility to verify that each @pos is alive. * * If a subsystem synchronizes against the parent in its ->css_online() and - * before starting iterating, a cgroup which finished ->css_online() is + * before starting iterating, a css which finished ->css_online() is * guaranteed to be visible in the future iterations. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ -#define cgroup_for_each_child(pos, cgrp) \ - for ((pos) = cgroup_next_child(NULL, (cgrp)); (pos); \ - (pos) = cgroup_next_child((pos), (cgrp))) +#define css_for_each_child(pos, parent) \ + for ((pos) = css_next_child(NULL, (parent)); (pos); \ + (pos) = css_next_child((pos), (parent))) -struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, - struct cgroup *cgroup); -struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); +struct cgroup_subsys_state * +css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *css); + +struct cgroup_subsys_state * +css_rightmost_descendant(struct cgroup_subsys_state *pos); /** - * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants - * @pos: the cgroup * to use as the loop cursor - * @cgroup: cgroup whose descendants to walk + * css_for_each_descendant_pre - pre-order walk of a css's descendants + * @pos: the css * to use as the loop cursor + * @root: css whose descendants to walk * - * Walk @cgroup's descendants. Must be called under rcu_read_lock(). A - * descendant cgroup which hasn't finished ->css_online() or already has + * Walk @root's descendants. Must be called under rcu_read_lock(). A + * descendant css which hasn't finished ->css_online() or already has * finished ->css_offline() may show up during traversal and it's each * subsystem's responsibility to verify that each @pos is alive. * * If a subsystem synchronizes against the parent in its ->css_online() and * before starting iterating, and synchronizes against @pos on each - * iteration, any descendant cgroup which finished ->css_online() is + * iteration, any descendant css which finished ->css_online() is * guaranteed to be visible in the future iterations. * * In other words, the following guarantees that a descendant can't escape * state updates of its ancestors. * - * my_online(@cgrp) + * my_online(@css) * { - * Lock @cgrp->parent and @cgrp; - * Inherit state from @cgrp->parent; + * Lock @css's parent and @css; + * Inherit state from the parent; * Unlock both. * } * - * my_update_state(@cgrp) + * my_update_state(@css) * { - * Lock @cgrp; - * Update @cgrp's state; - * Unlock @cgrp; + * Lock @css; + * Update @css's state; + * Unlock @css; * - * cgroup_for_each_descendant_pre(@pos, @cgrp) { + * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; - * Verify @pos is alive and inherit state from @pos->parent; + * Verify @pos is alive and inherit state from @pos's parent; * Unlock @pos; * } * } @@ -851,8 +855,7 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); * visible by walking order and, as long as inheriting operations to the * same @pos are atomic to each other, multiple updates racing each other * still result in the correct state. It's guaranateed that at least one - * inheritance happens for any cgroup after the latest update to its - * parent. + * inheritance happens for any css after the latest update to its parent. * * If checking parent's state requires locking the parent, each inheriting * iteration should lock and unlock both @pos->parent and @pos. @@ -865,25 +868,26 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ -#define cgroup_for_each_descendant_pre(pos, cgroup) \ - for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos); \ - pos = cgroup_next_descendant_pre((pos), (cgroup))) +#define css_for_each_descendant_pre(pos, css) \ + for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ + (pos) = css_next_descendant_pre((pos), (css))) -struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, - struct cgroup *cgroup); +struct cgroup_subsys_state * +css_next_descendant_post(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *css); /** - * cgroup_for_each_descendant_post - post-order walk of a cgroup's descendants - * @pos: the cgroup * to use as the loop cursor - * @cgroup: cgroup whose descendants to walk + * css_for_each_descendant_post - post-order walk of a css's descendants + * @pos: the css * to use as the loop cursor + * @css: css whose descendants to walk * - * Similar to cgroup_for_each_descendant_pre() but performs post-order + * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. Note that the walk visibility guarantee described in * pre-order walk doesn't apply the same to post-order walks. */ -#define cgroup_for_each_descendant_post(pos, cgroup) \ - for (pos = cgroup_next_descendant_post(NULL, (cgroup)); (pos); \ - pos = cgroup_next_descendant_post((pos), (cgroup))) +#define css_for_each_descendant_post(pos, css) \ + for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ + (pos) = css_next_descendant_post((pos), (css))) /* A cgroup_iter should be treated as an opaque object */ struct cgroup_iter { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2b7354f..91eac33 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2814,8 +2814,8 @@ static void cgroup_cfts_prepare(void) /* * Thanks to the entanglement with vfs inode locking, we can't walk * the existing cgroups under cgroup_mutex and create files. - * Instead, we use cgroup_for_each_descendant_pre() and drop RCU - * read lock before calling cgroup_addrm_files(). + * Instead, we use css_for_each_descendant_pre() and drop RCU read + * lock before calling cgroup_addrm_files(). */ mutex_lock(&cgroup_mutex); } @@ -2825,10 +2825,11 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) { LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; - struct cgroup *cgrp, *root = &ss->root->top_cgroup; + struct cgroup *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; struct dentry *prev = NULL; struct inode *inode; + struct cgroup_subsys_state *css; u64 update_before; int ret = 0; @@ -2861,7 +2862,9 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) /* add/rm files for all cgroups created before */ rcu_read_lock(); - cgroup_for_each_descendant_pre(cgrp, root) { + css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) { + struct cgroup *cgrp = css->cgroup; + if (cgroup_is_dead(cgrp)) continue; @@ -3037,17 +3040,21 @@ static void cgroup_enable_task_cg_lists(void) } /** - * cgroup_next_child - find the next child of a given cgroup - * @pos: the current position (%NULL to initiate traversal) - * @cgrp: cgroup whose descendants to walk + * css_next_child - find the next child of a given css + * @pos_css: the current position (%NULL to initiate traversal) + * @parent_css: css whose children to walk * - * This function returns the next child of @cgrp and should be called under - * RCU read lock. The only requirement is that @cgrp and @pos are - * accessible. The next sibling is guaranteed to be returned regardless of - * their states. + * This function returns the next child of @parent_css and should be called + * under RCU read lock. The only requirement is that @parent_css and + * @pos_css are accessible. The next sibling is guaranteed to be returned + * regardless of their states. */ -struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp) +struct cgroup_subsys_state * +css_next_child(struct cgroup_subsys_state *pos_css, + struct cgroup_subsys_state *parent_css) { + struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; + struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -3081,59 +3088,64 @@ struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp) break; } - if (&next->sibling != &cgrp->children) - return next; - return NULL; + if (&next->sibling == &cgrp->children) + return NULL; + + if (parent_css->ss) + return cgroup_css(next, parent_css->ss->subsys_id); + else + return &next->dummy_css; } -EXPORT_SYMBOL_GPL(cgroup_next_child); +EXPORT_SYMBOL_GPL(css_next_child); /** - * cgroup_next_descendant_pre - find the next descendant for pre-order walk + * css_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) - * @cgroup: cgroup whose descendants to walk + * @root: css whose descendants to walk * - * To be used by cgroup_for_each_descendant_pre(). Find the next - * descendant to visit for pre-order traversal of @cgroup's descendants. + * To be used by css_for_each_descendant_pre(). Find the next descendant + * to visit for pre-order traversal of @root's descendants. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This * function will return the correct next descendant as long as both @pos - * and @cgroup are accessible and @pos is a descendant of @cgroup. + * and @root are accessible and @pos is a descendant of @root. */ -struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, - struct cgroup *cgroup) +struct cgroup_subsys_state * +css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) { - struct cgroup *next; + struct cgroup_subsys_state *next; WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, pretend we just visited @cgroup */ + /* if first iteration, pretend we just visited @root */ if (!pos) - pos = cgroup; + pos = root; /* visit the first child if exists */ - next = cgroup_next_child(NULL, pos); + next = css_next_child(NULL, pos); if (next) return next; /* no child, visit my or the closest ancestor's next sibling */ - while (pos != cgroup) { - next = cgroup_next_child(pos, pos->parent); + while (pos != root) { + next = css_next_child(pos, css_parent(pos)); if (next) return next; - pos = pos->parent; + pos = css_parent(pos); } return NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); +EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** - * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup - * @pos: cgroup of interest + * css_rightmost_descendant - return the rightmost descendant of a css + * @pos: css of interest * - * Return the rightmost descendant of @pos. If there's no descendant, - * @pos is returned. This can be used during pre-order traversal to skip + * Return the rightmost descendant of @pos. If there's no descendant, @pos + * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * * While this function requires RCU read locking, it doesn't require the @@ -3141,9 +3153,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); * function will return the correct rightmost descendant as long as @pos is * accessible. */ -struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) +struct cgroup_subsys_state * +css_rightmost_descendant(struct cgroup_subsys_state *pos) { - struct cgroup *last, *tmp; + struct cgroup_subsys_state *last, *tmp; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -3151,62 +3164,64 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; - cgroup_for_each_child(tmp, last) + css_for_each_child(tmp, last) pos = tmp; } while (pos); return last; } -EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); +EXPORT_SYMBOL_GPL(css_rightmost_descendant); -static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) +static struct cgroup_subsys_state * +css_leftmost_descendant(struct cgroup_subsys_state *pos) { - struct cgroup *last; + struct cgroup_subsys_state *last; do { last = pos; - pos = cgroup_next_child(NULL, pos); + pos = css_next_child(NULL, pos); } while (pos); return last; } /** - * cgroup_next_descendant_post - find the next descendant for post-order walk + * css_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) - * @cgroup: cgroup whose descendants to walk + * @root: css whose descendants to walk * - * To be used by cgroup_for_each_descendant_post(). Find the next - * descendant to visit for post-order traversal of @cgroup's descendants. + * To be used by css_for_each_descendant_post(). Find the next descendant + * to visit for post-order traversal of @root's descendants. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This * function will return the correct next descendant as long as both @pos * and @cgroup are accessible and @pos is a descendant of @cgroup. */ -struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, - struct cgroup *cgroup) +struct cgroup_subsys_state * +css_next_descendant_post(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) { - struct cgroup *next; + struct cgroup_subsys_state *next; WARN_ON_ONCE(!rcu_read_lock_held()); /* if first iteration, visit the leftmost descendant */ if (!pos) { - next = cgroup_leftmost_descendant(cgroup); - return next != cgroup ? next : NULL; + next = css_leftmost_descendant(root); + return next != root ? next : NULL; } /* if there's an unvisited sibling, visit its leftmost descendant */ - next = cgroup_next_child(pos, pos->parent); + next = css_next_child(pos, css_parent(pos)); if (next) - return cgroup_leftmost_descendant(next); + return css_leftmost_descendant(next); /* no sibling left, visit parent */ - next = pos->parent; - return next != cgroup ? next : NULL; + next = css_parent(pos); + return next != root ? next : NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); +EXPORT_SYMBOL_GPL(css_next_descendant_post); void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) @@ -4549,9 +4564,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by cgroup_next_child() to + * CGRP_DEAD assertion is depended upon by css_next_child() to * resume iteration after dropping RCU read lock. See - * cgroup_next_child() for details. + * css_next_child() for details. */ set_bit(CGRP_DEAD, &cgrp->flags); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 19613ba..98ca48d 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -50,11 +50,6 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) return css ? container_of(css, struct freezer, css) : NULL; } -static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) -{ - return css_freezer(cgroup_css(cgroup, freezer_subsys_id)); -} - static inline struct freezer *task_freezer(struct task_struct *task) { return css_freezer(task_css(task, freezer_subsys_id)); @@ -120,7 +115,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) /* * The following double locking and freezing state inheritance * guarantee that @cgroup can never escape ancestors' freezing - * states. See cgroup_for_each_descendant_pre() for details. + * states. See css_for_each_descendant_pre() for details. */ if (parent) spin_lock_irq(&parent->lock); @@ -262,7 +257,7 @@ out: static void update_if_frozen(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); - struct cgroup *pos; + struct cgroup_subsys_state *pos; struct cgroup_iter it; struct task_struct *task; @@ -275,8 +270,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css) goto out_unlock; /* are all (live) children frozen? */ - cgroup_for_each_child(pos, css->cgroup) { - struct freezer *child = cgroup_freezer(pos); + css_for_each_child(pos, css) { + struct freezer *child = css_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && !(child->state & CGROUP_FROZEN)) @@ -309,13 +304,13 @@ out_unlock: static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; rcu_read_lock(); /* update states bottom-up */ - cgroup_for_each_descendant_post(pos, css->cgroup) - update_if_frozen(cgroup_css(pos, freezer_subsys_id)); + css_for_each_descendant_post(pos, css) + update_if_frozen(pos); update_if_frozen(css); rcu_read_unlock(); @@ -396,7 +391,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, */ static void freezer_change_state(struct freezer *freezer, bool freeze) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; /* update @freezer */ spin_lock_irq(&freezer->lock); @@ -409,8 +404,8 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) * CGROUP_FREEZING_PARENT. */ rcu_read_lock(); - cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { - struct freezer *pos_f = cgroup_freezer(pos); + css_for_each_descendant_pre(pos, &freezer->css) { + struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); /* diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 89b76e1..be4f503 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -210,29 +210,29 @@ static struct cpuset top_cpuset = { /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @parent_cs: target cpuset to walk children of * * Walk @child_cs through the online children of @parent_cs. Must be used * with RCU read locked. */ -#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ - cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ - if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) +#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ + css_for_each_child((pos_css), &(parent_cs)->css) \ + if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) /** * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants * @des_cs: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @root_cs: target cpuset to walk ancestor of * * Walk @des_cs through the online descendants of @root_cs. Must be used - * with RCU read locked. The caller may modify @pos_cgrp by calling - * cgroup_rightmost_descendant() to skip subtree. + * with RCU read locked. The caller may modify @pos_css by calling + * css_rightmost_descendant() to skip subtree. */ -#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ - cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ - if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) +#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ + css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ + if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* * There are two global mutexes guarding cpuset structures - cpuset_mutex @@ -430,7 +430,7 @@ static void free_trial_cpuset(struct cpuset *trial) static int validate_change(struct cpuset *cur, struct cpuset *trial) { - struct cgroup *cgrp; + struct cgroup_subsys_state *css; struct cpuset *c, *par; int ret; @@ -438,7 +438,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* Each of our child cpusets must be a subset of us */ ret = -EBUSY; - cpuset_for_each_child(c, cgrp, cur) + cpuset_for_each_child(c, css, cur) if (!is_cpuset_subset(c, trial)) goto out; @@ -459,7 +459,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * overlap */ ret = -EINVAL; - cpuset_for_each_child(c, cgrp, par) { + cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) @@ -508,13 +508,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *root_cs) { struct cpuset *cp; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); continue; } @@ -589,7 +589,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; doms = NULL; dattr = NULL; @@ -618,7 +618,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csn = 0; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { /* * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The @@ -635,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csa[csn++] = cp; /* skip @cp's subtree */ - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); @@ -886,16 +886,16 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root, struct ptr_heap *heap) { struct cpuset *cp; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; if (update_root) update_tasks_cpumask(root_cs, heap); rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { /* skip the whole subtree if @cp have some CPU */ if (!cpumask_empty(cp->cpus_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); continue; } if (!css_tryget(&cp->css)) @@ -1143,16 +1143,16 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root, struct ptr_heap *heap) { struct cpuset *cp; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; if (update_root) update_tasks_nodemask(root_cs, heap); rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { /* skip the whole subtree if @cp have some CPU */ if (!nodes_empty(cp->mems_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); continue; } if (!css_tryget(&cp->css)) @@ -1973,7 +1973,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; if (!parent) return 0; @@ -2005,7 +2005,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) * (and likewise for mems) to the new cgroup. */ rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_cgrp, parent) { + cpuset_for_each_child(tmp_cs, pos_css, parent) { if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { rcu_read_unlock(); goto out_unlock; @@ -2252,10 +2252,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { struct cpuset *cs; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!css_tryget(&cs->css)) continue; rcu_read_unlock(); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab64dfc..2285319 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1082,7 +1082,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, struct mem_cgroup *last_visited) { - struct cgroup *prev_cgroup, *next_cgroup; + struct cgroup_subsys_state *prev_css, *next_css; /* * Root is not visited by cgroup iterators so it needs an @@ -1091,11 +1091,9 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, if (!last_visited) return root; - prev_cgroup = (last_visited == root) ? NULL - : last_visited->css.cgroup; + prev_css = (last_visited == root) ? NULL : &last_visited->css; skip_node: - next_cgroup = cgroup_next_descendant_pre( - prev_cgroup, root->css.cgroup); + next_css = css_next_descendant_pre(prev_css, &root->css); /* * Even if we found a group we have to make sure it is @@ -1104,13 +1102,13 @@ skip_node: * last_visited css is safe to use because it is * protected by css_get and the tree walk is rcu safe. */ - if (next_cgroup) { - struct mem_cgroup *mem = mem_cgroup_from_cont( - next_cgroup); + if (next_css) { + struct mem_cgroup *mem = mem_cgroup_from_css(next_css); + if (css_tryget(&mem->css)) return mem; else { - prev_cgroup = next_cgroup; + prev_css = next_css; goto skip_node; } } @@ -4939,10 +4937,10 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) */ static inline bool __memcg_has_children(struct mem_cgroup *memcg) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; /* bounce at first found */ - cgroup_for_each_child(pos, memcg->css.cgroup) + css_for_each_child(pos, &memcg->css) return true; return false; } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index e0ca464..9bf230a 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -56,11 +56,6 @@ static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) return s ? container_of(s, struct dev_cgroup, css) : NULL; } -static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup) -{ - return css_to_devcgroup(cgroup_css(cgroup, devices_subsys_id)); -} - static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { return css_to_devcgroup(task_css(task, devices_subsys_id)); @@ -447,13 +442,13 @@ static void revalidate_active_exceptions(struct dev_cgroup *devcg) static int propagate_exception(struct dev_cgroup *devcg_root, struct dev_exception_item *ex) { - struct cgroup *root = devcg_root->css.cgroup, *pos; + struct cgroup_subsys_state *pos; int rc = 0; rcu_read_lock(); - cgroup_for_each_descendant_pre(pos, root) { - struct dev_cgroup *devcg = cgroup_to_devcgroup(pos); + css_for_each_descendant_pre(pos, &devcg_root->css) { + struct dev_cgroup *devcg = css_to_devcgroup(pos); /* * Because devcgroup_mutex is held, no devcg will become -- cgit v0.10.2 From d515876e9d951d8cf7fc7c90db2967664bdc89ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: relocate cgroup_advance_iter() For some reason, cgroup_advance_iter() is standing lonely all away from its iter comrades. Relocate it. This is cosmetic. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 91eac33..d56d936 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2982,30 +2982,6 @@ int cgroup_task_count(const struct cgroup *cgrp) } /* - * Advance a list_head iterator. The iterator should be positioned at - * the start of a css_set - */ -static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) -{ - struct list_head *l = it->cset_link; - struct cgrp_cset_link *link; - struct css_set *cset; - - /* Advance to the next non-empty css_set */ - do { - l = l->next; - if (l == &cgrp->cset_links) { - it->cset_link = NULL; - return; - } - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } while (list_empty(&cset->tasks)); - it->cset_link = l; - it->task = cset->tasks.next; -} - -/* * To reduce the fork() overhead for systems that are not actually * using their cgroups capability, we don't maintain the lists running * through each css_set to its tasks until we see the list actually @@ -3223,6 +3199,30 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, } EXPORT_SYMBOL_GPL(css_next_descendant_post); +/* + * Advance a list_head iterator. The iterator should be positioned at + * the start of a css_set + */ +static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) +{ + struct list_head *l = it->cset_link; + struct cgrp_cset_link *link; + struct css_set *cset; + + /* Advance to the next non-empty css_set */ + do { + l = l->next; + if (l == &cgrp->cset_links) { + it->cset_link = NULL; + return; + } + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } while (list_empty(&cset->tasks)); + it->cset_link = l; + it->task = cset->tasks.next; +} + void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) { -- cgit v0.10.2 From 0942eeeef68f9493c1bcb1a52baf612b73fcf9fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: rename cgroup_iter to cgroup_task_iter cgroup now has multiple iterators and it's quite confusing to have something which walks over tasks of a single cgroup named cgroup_iter. Let's rename it to cgroup_task_iter. While at it, reformat / update comments and replace the overview comment above the interface function decls with proper function comments. Such overview can be useful but function comments should be more than enough here. This is pure rename and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4bc22f4..ea43979 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -889,31 +889,16 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) -/* A cgroup_iter should be treated as an opaque object */ -struct cgroup_iter { - struct list_head *cset_link; - struct list_head *task; +/* A cgroup_task_iter should be treated as an opaque object */ +struct cgroup_task_iter { + struct list_head *cset_link; + struct list_head *task; }; -/* - * To iterate across the tasks in a cgroup: - * - * 1) call cgroup_iter_start to initialize an iterator - * - * 2) call cgroup_iter_next() to retrieve member tasks until it - * returns NULL or until you want to end the iteration - * - * 3) call cgroup_iter_end() to destroy the iterator. - * - * Or, call cgroup_scan_tasks() to iterate through every task in a - * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling - * the test_task() callback, but not while calling the process_task() - * callback. - */ -void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); -struct task_struct *cgroup_iter_next(struct cgroup *cgrp, - struct cgroup_iter *it); -void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); +void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it); +struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, + struct cgroup_task_iter *it); +void cgroup_task_iter_end(struct cgroup *cgrp, struct cgroup_task_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d56d936..15c93f9c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -367,9 +367,11 @@ static struct cgrp_cset_link init_cgrp_cset_link; static int cgroup_init_idr(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); -/* css_set_lock protects the list of css_set objects, and the - * chain of tasks off each css_set. Nests outside task->alloc_lock - * due to cgroup_iter_start() */ +/* + * css_set_lock protects the list of css_set objects, and the chain of + * tasks off each css_set. Nests outside task->alloc_lock due to + * cgroup_task_iter_start(). + */ static DEFINE_RWLOCK(css_set_lock); static int css_set_count; @@ -394,10 +396,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -/* We don't maintain the lists running through each css_set to its - * task until after the first call to cgroup_iter_start(). This - * reduces the fork()/exit() overhead for people who have cgroups - * compiled into their kernel but not actually in use */ +/* + * We don't maintain the lists running through each css_set to its task + * until after the first call to cgroup_task_iter_start(). This reduces + * the fork()/exit() overhead for people who have cgroups compiled into + * their kernel but not actually in use. + */ static int use_task_css_set_links __read_mostly; static void __put_css_set(struct css_set *cset, int taskexit) @@ -2982,10 +2986,10 @@ int cgroup_task_count(const struct cgroup *cgrp) } /* - * To reduce the fork() overhead for systems that are not actually - * using their cgroups capability, we don't maintain the lists running - * through each css_set to its tasks until we see the list actually - * used - in other words after the first call to cgroup_iter_start(). + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first call to cgroup_task_iter_start(). */ static void cgroup_enable_task_cg_lists(void) { @@ -3199,11 +3203,15 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, } EXPORT_SYMBOL_GPL(css_next_descendant_post); -/* - * Advance a list_head iterator. The iterator should be positioned at - * the start of a css_set +/** + * cgroup_advance_task_iter - advance a task itererator to the next css_set + * @cgrp: the cgroup to walk tasks of + * @it: the iterator to advance + * + * Advance @it to the next css_set to walk. */ -static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) +static void cgroup_advance_task_iter(struct cgroup *cgrp, + struct cgroup_task_iter *it) { struct list_head *l = it->cset_link; struct cgrp_cset_link *link; @@ -3223,7 +3231,21 @@ static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) it->task = cset->tasks.next; } -void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) +/** + * cgroup_task_iter_start - initiate task iteration + * @cgrp: the cgroup to walk tasks of + * @it: the task iterator to use + * + * Initiate iteration through the tasks of @cgrp. The caller can call + * cgroup_task_iter_next() to walk through the tasks until the function + * returns NULL. On completion of iteration, cgroup_task_iter_end() must + * be called. + * + * Note that this function acquires a lock which is released when the + * iteration finishes. The caller can't sleep while iteration is in + * progress. + */ +void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it) __acquires(css_set_lock) { /* @@ -3236,11 +3258,20 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) read_lock(&css_set_lock); it->cset_link = &cgrp->cset_links; - cgroup_advance_iter(cgrp, it); + cgroup_advance_task_iter(cgrp, it); } -struct task_struct *cgroup_iter_next(struct cgroup *cgrp, - struct cgroup_iter *it) +/** + * cgroup_task_iter_next - return the next task for the iterator + * @cgrp: the cgroup to walk tasks of + * @it: the task iterator being iterated + * + * The "next" function for task iteration. @it should have been + * initialized via cgroup_task_iter_start(). Returns NULL when the + * iteration reaches the end. + */ +struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, + struct cgroup_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3254,16 +3285,25 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, l = l->next; link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); if (l == &link->cset->tasks) { - /* We reached the end of this task list - move on to - * the next cg_cgroup_link */ - cgroup_advance_iter(cgrp, it); + /* + * We reached the end of this task list - move on to the + * next cgrp_cset_link. + */ + cgroup_advance_task_iter(cgrp, it); } else { it->task = l; } return res; } -void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) +/** + * cgroup_task_iter_end - finish task iteration + * @cgrp: the cgroup to walk tasks of + * @it: the task iterator to finish + * + * Finish task iteration started by cgroup_task_iter_start(). + */ +void cgroup_task_iter_end(struct cgroup *cgrp, struct cgroup_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3312,7 +3352,7 @@ static inline int started_after(void *p1, void *p2) * Iterate through all the tasks in a cgroup, calling test_task() for each, * and if it returns true, call process_task() for it also. * The test_task pointer may be NULL, meaning always true (select all tasks). - * Effectively duplicates cgroup_iter_{start,next,end}() + * Effectively duplicates cgroup_task_iter_{start,next,end}() * but does not lock css_set_lock for the call to process_task(). * The struct cgroup_scanner may be embedded in any structure of the caller's * creation. @@ -3333,7 +3373,7 @@ static inline int started_after(void *p1, void *p2) int cgroup_scan_tasks(struct cgroup_scanner *scan) { int retval, i; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *p, *dropped; /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; @@ -3368,8 +3408,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_iter_start(scan->cgrp, &it); - while ((p = cgroup_iter_next(scan->cgrp, &it))) { + cgroup_task_iter_start(scan->cgrp, &it); + while ((p = cgroup_task_iter_next(scan->cgrp, &it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3402,7 +3442,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_iter_end(scan->cgrp, &it); + cgroup_task_iter_end(scan->cgrp, &it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3608,7 +3648,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, pid_t *array; int length; int pid, n = 0; /* used for populating the array */ - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; @@ -3623,8 +3663,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { + cgroup_task_iter_start(cgrp, &it); + while ((tsk = cgroup_task_iter_next(cgrp, &it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3635,7 +3675,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_iter_end(cgrp, &it); + cgroup_task_iter_end(cgrp, &it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3669,7 +3709,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { int ret = -EINVAL; struct cgroup *cgrp; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *tsk; /* @@ -3683,8 +3723,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { + cgroup_task_iter_start(cgrp, &it); + while ((tsk = cgroup_task_iter_next(cgrp, &it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3704,7 +3744,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_iter_end(cgrp, &it); + cgroup_task_iter_end(cgrp, &it); err: return ret; @@ -5137,7 +5177,7 @@ void cgroup_fork(struct task_struct *child) * Adds the task to the list running through its css_set if necessary and * call the subsystem fork() callbacks. Has to be after the task is * visible on the task list in case we race with the first call to - * cgroup_iter_start() - to guarantee that the new task ends up on its + * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ void cgroup_post_fork(struct task_struct *child) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 98ca48d..c9177f8 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -258,7 +258,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct cgroup_subsys_state *pos; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *task; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -279,9 +279,9 @@ static void update_if_frozen(struct cgroup_subsys_state *css) } /* are all tasks frozen? */ - cgroup_iter_start(css->cgroup, &it); + cgroup_task_iter_start(css->cgroup, &it); - while ((task = cgroup_iter_next(css->cgroup, &it))) { + while ((task = cgroup_task_iter_next(css->cgroup, &it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -296,7 +296,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_iter_end(css->cgroup, &it); + cgroup_task_iter_end(css->cgroup, &it); out_unlock: spin_unlock_irq(&freezer->lock); } @@ -323,25 +323,25 @@ static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, static void freeze_cgroup(struct freezer *freezer) { struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) + cgroup_task_iter_start(cgroup, &it); + while ((task = cgroup_task_iter_next(cgroup, &it))) freeze_task(task); - cgroup_iter_end(cgroup, &it); + cgroup_task_iter_end(cgroup, &it); } static void unfreeze_cgroup(struct freezer *freezer) { struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) + cgroup_task_iter_start(cgroup, &it); + while ((task = cgroup_task_iter_next(cgroup, &it))) __thaw_task(task); - cgroup_iter_end(cgroup, &it); + cgroup_task_iter_end(cgroup, &it); } /** diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2285319..00b055d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1800,11 +1800,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct cgroup *cgroup = iter->css.cgroup; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { + cgroup_task_iter_start(cgroup, &it); + while ((task = cgroup_task_iter_next(cgroup, &it))) { switch (oom_scan_process_thread(task, totalpages, NULL, false)) { case OOM_SCAN_SELECT: @@ -1817,7 +1817,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, case OOM_SCAN_CONTINUE: continue; case OOM_SCAN_ABORT: - cgroup_iter_end(cgroup, &it); + cgroup_task_iter_end(cgroup, &it); mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); @@ -1834,7 +1834,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, get_task_struct(chosen); } } - cgroup_iter_end(cgroup, &it); + cgroup_task_iter_end(cgroup, &it); } if (!chosen) -- cgit v0.10.2 From c59cd3d840b1b0a8f996cbbd9132128dcaabbeb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make cgroup_task_iter remember the cgroup being iterated Currently all cgroup_task_iter functions require @cgrp to be passed in, which is superflous and increases chance of usage error. Make cgroup_task_iter remember the cgroup being iterated and drop @cgrp argument from next and end functions. This patch doesn't introduce any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ea43979..0287fcc 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -891,14 +891,14 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, /* A cgroup_task_iter should be treated as an opaque object */ struct cgroup_task_iter { + struct cgroup *origin_cgrp; struct list_head *cset_link; struct list_head *task; }; void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it); -struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, - struct cgroup_task_iter *it); -void cgroup_task_iter_end(struct cgroup *cgrp, struct cgroup_task_iter *it); +struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it); +void cgroup_task_iter_end(struct cgroup_task_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15c93f9c..abc62ea 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3205,13 +3205,11 @@ EXPORT_SYMBOL_GPL(css_next_descendant_post); /** * cgroup_advance_task_iter - advance a task itererator to the next css_set - * @cgrp: the cgroup to walk tasks of * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ -static void cgroup_advance_task_iter(struct cgroup *cgrp, - struct cgroup_task_iter *it) +static void cgroup_advance_task_iter(struct cgroup_task_iter *it) { struct list_head *l = it->cset_link; struct cgrp_cset_link *link; @@ -3220,7 +3218,7 @@ static void cgroup_advance_task_iter(struct cgroup *cgrp, /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &cgrp->cset_links) { + if (l == &it->origin_cgrp->cset_links) { it->cset_link = NULL; return; } @@ -3257,21 +3255,22 @@ void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); + + it->origin_cgrp = cgrp; it->cset_link = &cgrp->cset_links; - cgroup_advance_task_iter(cgrp, it); + + cgroup_advance_task_iter(it); } /** * cgroup_task_iter_next - return the next task for the iterator - * @cgrp: the cgroup to walk tasks of * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been * initialized via cgroup_task_iter_start(). Returns NULL when the * iteration reaches the end. */ -struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, - struct cgroup_task_iter *it) +struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3289,7 +3288,7 @@ struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, * We reached the end of this task list - move on to the * next cgrp_cset_link. */ - cgroup_advance_task_iter(cgrp, it); + cgroup_advance_task_iter(it); } else { it->task = l; } @@ -3298,12 +3297,11 @@ struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, /** * cgroup_task_iter_end - finish task iteration - * @cgrp: the cgroup to walk tasks of * @it: the task iterator to finish * * Finish task iteration started by cgroup_task_iter_start(). */ -void cgroup_task_iter_end(struct cgroup *cgrp, struct cgroup_task_iter *it) +void cgroup_task_iter_end(struct cgroup_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3409,7 +3407,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ heap->size = 0; cgroup_task_iter_start(scan->cgrp, &it); - while ((p = cgroup_task_iter_next(scan->cgrp, &it))) { + while ((p = cgroup_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3442,7 +3440,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_task_iter_end(scan->cgrp, &it); + cgroup_task_iter_end(&it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3664,7 +3662,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, return -ENOMEM; /* now, populate the array */ cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(cgrp, &it))) { + while ((tsk = cgroup_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3675,7 +3673,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_task_iter_end(cgrp, &it); + cgroup_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3724,7 +3722,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) cgrp = dentry->d_fsdata; cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(cgrp, &it))) { + while ((tsk = cgroup_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3744,7 +3742,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_task_iter_end(cgrp, &it); + cgroup_task_iter_end(&it); err: return ret; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index c9177f8..e0ab9bf 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -281,7 +281,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) /* are all tasks frozen? */ cgroup_task_iter_start(css->cgroup, &it); - while ((task = cgroup_task_iter_next(css->cgroup, &it))) { + while ((task = cgroup_task_iter_next(&it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -296,7 +296,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_task_iter_end(css->cgroup, &it); + cgroup_task_iter_end(&it); out_unlock: spin_unlock_irq(&freezer->lock); } @@ -327,9 +327,9 @@ static void freeze_cgroup(struct freezer *freezer) struct task_struct *task; cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(cgroup, &it))) + while ((task = cgroup_task_iter_next(&it))) freeze_task(task); - cgroup_task_iter_end(cgroup, &it); + cgroup_task_iter_end(&it); } static void unfreeze_cgroup(struct freezer *freezer) @@ -339,9 +339,9 @@ static void unfreeze_cgroup(struct freezer *freezer) struct task_struct *task; cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(cgroup, &it))) + while ((task = cgroup_task_iter_next(&it))) __thaw_task(task); - cgroup_task_iter_end(cgroup, &it); + cgroup_task_iter_end(&it); } /** diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00b055d..5a5f4dc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1804,7 +1804,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, struct task_struct *task; cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(cgroup, &it))) { + while ((task = cgroup_task_iter_next(&it))) { switch (oom_scan_process_thread(task, totalpages, NULL, false)) { case OOM_SCAN_SELECT: @@ -1817,7 +1817,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, case OOM_SCAN_CONTINUE: continue; case OOM_SCAN_ABORT: - cgroup_task_iter_end(cgroup, &it); + cgroup_task_iter_end(&it); mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); @@ -1834,7 +1834,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, get_task_struct(chosen); } } - cgroup_task_iter_end(cgroup, &it); + cgroup_task_iter_end(&it); } if (!chosen) -- cgit v0.10.2 From e535837b1dae17b5a2d76ea1bc22ac1a79354624 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: remove struct cgroup_scanner cgroup_scan_tasks() takes a pointer to struct cgroup_scanner as its sole argument and the only function of that struct is packing the arguments of the function call which are consisted of five fields. It's not too unusual to pack parameters into a struct when the number of arguments gets excessive or the whole set needs to be passed around a lot, but neither holds here making it just weird. Drop struct cgroup_scanner and pass the params directly to cgroup_scan_tasks(). Note that struct cpuset_change_nodemask_arg was added to cpuset.c to pass both ->cs and ->newmems pointer to cpuset_change_nodemask() using single data pointer. This doesn't make any functional differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0287fcc..8472ed5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -528,15 +528,6 @@ struct cftype_set { struct cftype *cfts; }; -struct cgroup_scanner { - struct cgroup *cgrp; - int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); - void (*process_task)(struct task_struct *p, - struct cgroup_scanner *scan); - struct ptr_heap *heap; - void *data; -}; - /* * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This * function can be called as long as @cgrp is accessible. @@ -899,7 +890,12 @@ struct cgroup_task_iter { void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it); struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it); void cgroup_task_iter_end(struct cgroup_task_iter *it); -int cgroup_scan_tasks(struct cgroup_scanner *scan); + +int cgroup_scan_tasks(struct cgroup *cgrp, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap); + int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index abc62ea..7b16ddb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3343,32 +3343,37 @@ static inline int started_after(void *p1, void *p2) /** * cgroup_scan_tasks - iterate though all the tasks in a cgroup - * @scan: struct cgroup_scanner containing arguments for the scan + * @cgrp: the cgroup to iterate tasks of + * @test: optional test callback + * @process: process callback + * @data: data passed to @test and @process + * @heap: optional pre-allocated heap used for task iteration * - * Arguments include pointers to callback functions test_task() and - * process_task(). - * Iterate through all the tasks in a cgroup, calling test_task() for each, - * and if it returns true, call process_task() for it also. - * The test_task pointer may be NULL, meaning always true (select all tasks). - * Effectively duplicates cgroup_task_iter_{start,next,end}() - * but does not lock css_set_lock for the call to process_task(). - * The struct cgroup_scanner may be embedded in any structure of the caller's - * creation. - * It is guaranteed that process_task() will act on every task that - * is a member of the cgroup for the duration of this call. This - * function may or may not call process_task() for tasks that exit - * or move to a different cgroup during the call, or are forked or - * move into the cgroup during the call. + * Iterate through all the tasks in a cgroup, calling @test for each, and + * if it returns %true, call @process for it also. * - * Note that test_task() may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should - * be cheap. - * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been - * pre-allocated and will be used for heap operations (and its "gt" member will - * be overwritten), else a temporary heap will be used (allocation of which - * may cause this function to fail). + * @test may be NULL, meaning always true (select all tasks), which + * effectively duplicates cgroup_task_iter_{start,next,end}() but does not + * lock css_set_lock for the call to @process. + * + * It is guaranteed that @process will act on every task that is a member + * of @cgrp for the duration of this call. This function may or may not + * call @process for tasks that exit or move to a different cgroup during + * the call, or are forked or move into the cgroup during the call. + * + * Note that @test may be called with locks held, and may in some + * situations be called multiple times for the same task, so it should be + * cheap. + * + * If @heap is non-NULL, a heap has been pre-allocated and will be used for + * heap operations (and its "gt" member will be overwritten), else a + * temporary heap will be used (allocation of which may cause this function + * to fail). */ -int cgroup_scan_tasks(struct cgroup_scanner *scan) +int cgroup_scan_tasks(struct cgroup *cgrp, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap) { int retval, i; struct cgroup_task_iter it; @@ -3376,12 +3381,10 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; struct ptr_heap tmp_heap; - struct ptr_heap *heap; struct timespec latest_time = { 0, 0 }; - if (scan->heap) { + if (heap) { /* The caller supplied our heap and pre-allocated its memory */ - heap = scan->heap; heap->gt = &started_after; } else { /* We need to allocate our own heap memory */ @@ -3394,25 +3397,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) again: /* - * Scan tasks in the cgroup, using the scanner's "test_task" callback - * to determine which are of interest, and using the scanner's - * "process_task" callback to process any of them that need an update. - * Since we don't want to hold any locks during the task updates, - * gather tasks to be processed in a heap structure. - * The heap is sorted by descending task start time. - * If the statically-sized heap fills up, we overflow tasks that - * started later, and in future iterations only consider tasks that - * started after the latest task in the previous pass. This + * Scan tasks in the cgroup, using the @test callback to determine + * which are of interest, and invoking @process callback on the + * ones which need an update. Since we don't want to hold any + * locks during the task updates, gather tasks to be processed in a + * heap structure. The heap is sorted by descending task start + * time. If the statically-sized heap fills up, we overflow tasks + * that started later, and in future iterations only consider tasks + * that started after the latest task in the previous pass. This * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_task_iter_start(scan->cgrp, &it); + cgroup_task_iter_start(cgrp, &it); while ((p = cgroup_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one */ - if (scan->test_task && !scan->test_task(p, scan)) + if (test && !test(p, data)) continue; /* * Only process tasks that started after the last task @@ -3450,7 +3452,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) latest_task = q; } /* Process the task per the caller's callback */ - scan->process_task(q, scan); + process(q, data); put_task_struct(q); } /* @@ -3467,10 +3469,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) return 0; } -static void cgroup_transfer_one_task(struct task_struct *task, - struct cgroup_scanner *scan) +static void cgroup_transfer_one_task(struct task_struct *task, void *data) { - struct cgroup *new_cgroup = scan->data; + struct cgroup *new_cgroup = data; mutex_lock(&cgroup_mutex); cgroup_attach_task(new_cgroup, task, false); @@ -3484,15 +3485,7 @@ static void cgroup_transfer_one_task(struct task_struct *task, */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - struct cgroup_scanner scan; - - scan.cgrp = from; - scan.test_task = NULL; /* select all tasks in cgroup */ - scan.process_task = cgroup_transfer_one_task; - scan.heap = NULL; - scan.data = to; - - return cgroup_scan_tasks(&scan); + return cgroup_scan_tasks(from, NULL, cgroup_transfer_one_task, to, NULL); } /* diff --git a/kernel/cpuset.c b/kernel/cpuset.c index be4f503..6fe23f2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -830,7 +830,7 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) /** * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's * @tsk: task to test - * @scan: struct cgroup_scanner containing the cgroup of the task + * @data: cpuset to @tsk belongs to * * Called by cgroup_scan_tasks() for each task in a cgroup whose * cpus_allowed mask needs to be changed. @@ -838,12 +838,11 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. */ -static void cpuset_change_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) +static void cpuset_change_cpumask(struct task_struct *tsk, void *data) { - struct cpuset *cpus_cs; + struct cpuset *cs = data; + struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cgrp)); set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); } @@ -862,13 +861,8 @@ static void cpuset_change_cpumask(struct task_struct *tsk, */ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { - struct cgroup_scanner scan; - - scan.cgrp = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_cpumask; - scan.heap = heap; - cgroup_scan_tasks(&scan); + cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_cpumask, cs, + heap); } /* @@ -1052,20 +1046,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_unlock(tsk); } +struct cpuset_change_nodemask_arg { + struct cpuset *cs; + nodemask_t *newmems; +}; + /* * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy * of it to cpuset's new mems_allowed, and migrate pages to new nodes if * memory_migrate flag is set. Called with cpuset_mutex held. */ -static void cpuset_change_nodemask(struct task_struct *p, - struct cgroup_scanner *scan) +static void cpuset_change_nodemask(struct task_struct *p, void *data) { - struct cpuset *cs = cgroup_cs(scan->cgrp); + struct cpuset_change_nodemask_arg *arg = data; + struct cpuset *cs = arg->cs; struct mm_struct *mm; int migrate; - nodemask_t *newmems = scan->data; - cpuset_change_task_nodemask(p, newmems); + cpuset_change_task_nodemask(p, arg->newmems); mm = get_task_mm(p); if (!mm) @@ -1075,7 +1073,7 @@ static void cpuset_change_nodemask(struct task_struct *p, mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) - cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); + cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); mmput(mm); } @@ -1093,19 +1091,14 @@ static void *cpuset_being_rebound; static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) { static nodemask_t newmems; /* protected by cpuset_mutex */ - struct cgroup_scanner scan; struct cpuset *mems_cs = effective_nodemask_cpuset(cs); + struct cpuset_change_nodemask_arg arg = { .cs = cs, + .newmems = &newmems }; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ guarantee_online_mems(mems_cs, &newmems); - scan.cgrp = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_nodemask; - scan.heap = heap; - scan.data = &newmems; - /* * The mpol_rebind_mm() call takes mmap_sem, which we couldn't * take while holding tasklist_lock. Forks can happen - the @@ -1116,7 +1109,8 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - cgroup_scan_tasks(&scan); + cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_nodemask, &arg, + heap); /* * All the tasks' nodemasks have been updated, update @@ -1263,17 +1257,18 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) /* * cpuset_change_flag - make a task's spread flags the same as its cpuset's * @tsk: task to be updated - * @scan: struct cgroup_scanner containing the cgroup of the task + * @data: cpuset to @tsk belongs to * * Called by cgroup_scan_tasks() for each task in a cgroup. * * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. */ -static void cpuset_change_flag(struct task_struct *tsk, - struct cgroup_scanner *scan) +static void cpuset_change_flag(struct task_struct *tsk, void *data) { - cpuset_update_task_spread_flag(cgroup_cs(scan->cgrp), tsk); + struct cpuset *cs = data; + + cpuset_update_task_spread_flag(cs, tsk); } /* @@ -1291,13 +1286,7 @@ static void cpuset_change_flag(struct task_struct *tsk, */ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) { - struct cgroup_scanner scan; - - scan.cgrp = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_flag; - scan.heap = heap; - cgroup_scan_tasks(&scan); + cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_flag, cs, heap); } /* -- cgit v0.10.2 From 72ec7029937f0518eff21b8762743c31591684f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make task iterators deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. This patch converts task iterators to deal with css instead of cgroup. Note that under unified hierarchy, different sets of tasks will be considered belonging to a given cgroup depending on the subsystem in question and making the iterators deal with css instead cgroup provides them with enough information about the iteration. While at it, fix several function comment formats in cpuset.c. This patch doesn't introduce any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8472ed5..cd105fc 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -880,21 +880,22 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) -/* A cgroup_task_iter should be treated as an opaque object */ -struct cgroup_task_iter { - struct cgroup *origin_cgrp; +/* A css_task_iter should be treated as an opaque object */ +struct css_task_iter { + struct cgroup_subsys_state *origin_css; struct list_head *cset_link; struct list_head *task; }; -void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it); -struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it); -void cgroup_task_iter_end(struct cgroup_task_iter *it); +void css_task_iter_start(struct cgroup_subsys_state *css, + struct css_task_iter *it); +struct task_struct *css_task_iter_next(struct css_task_iter *it); +void css_task_iter_end(struct css_task_iter *it); -int cgroup_scan_tasks(struct cgroup *cgrp, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap); +int css_scan_tasks(struct cgroup_subsys_state *css, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b16ddb..8c57301 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -370,7 +370,7 @@ static int cgroup_init_idr(struct cgroup_subsys *ss, /* * css_set_lock protects the list of css_set objects, and the chain of * tasks off each css_set. Nests outside task->alloc_lock due to - * cgroup_task_iter_start(). + * css_task_iter_start(). */ static DEFINE_RWLOCK(css_set_lock); static int css_set_count; @@ -398,9 +398,9 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) /* * We don't maintain the lists running through each css_set to its task - * until after the first call to cgroup_task_iter_start(). This reduces - * the fork()/exit() overhead for people who have cgroups compiled into - * their kernel but not actually in use. + * until after the first call to css_task_iter_start(). This reduces the + * fork()/exit() overhead for people who have cgroups compiled into their + * kernel but not actually in use. */ static int use_task_css_set_links __read_mostly; @@ -2989,7 +2989,7 @@ int cgroup_task_count(const struct cgroup *cgrp) * To reduce the fork() overhead for systems that are not actually using * their cgroups capability, we don't maintain the lists running through * each css_set to its tasks until we see the list actually used - in other - * words after the first call to cgroup_task_iter_start(). + * words after the first call to css_task_iter_start(). */ static void cgroup_enable_task_cg_lists(void) { @@ -3204,12 +3204,12 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, EXPORT_SYMBOL_GPL(css_next_descendant_post); /** - * cgroup_advance_task_iter - advance a task itererator to the next css_set + * css_advance_task_iter - advance a task itererator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ -static void cgroup_advance_task_iter(struct cgroup_task_iter *it) +static void css_advance_task_iter(struct css_task_iter *it) { struct list_head *l = it->cset_link; struct cgrp_cset_link *link; @@ -3218,7 +3218,7 @@ static void cgroup_advance_task_iter(struct cgroup_task_iter *it) /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &it->origin_cgrp->cset_links) { + if (l == &it->origin_css->cgroup->cset_links) { it->cset_link = NULL; return; } @@ -3230,47 +3230,48 @@ static void cgroup_advance_task_iter(struct cgroup_task_iter *it) } /** - * cgroup_task_iter_start - initiate task iteration - * @cgrp: the cgroup to walk tasks of + * css_task_iter_start - initiate task iteration + * @css: the css to walk tasks of * @it: the task iterator to use * - * Initiate iteration through the tasks of @cgrp. The caller can call - * cgroup_task_iter_next() to walk through the tasks until the function - * returns NULL. On completion of iteration, cgroup_task_iter_end() must - * be called. + * Initiate iteration through the tasks of @css. The caller can call + * css_task_iter_next() to walk through the tasks until the function + * returns NULL. On completion of iteration, css_task_iter_end() must be + * called. * * Note that this function acquires a lock which is released when the * iteration finishes. The caller can't sleep while iteration is in * progress. */ -void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it) +void css_task_iter_start(struct cgroup_subsys_state *css, + struct css_task_iter *it) __acquires(css_set_lock) { /* - * The first time anyone tries to iterate across a cgroup, - * we need to enable the list linking each css_set to its - * tasks, and fix up all existing tasks. + * The first time anyone tries to iterate across a css, we need to + * enable the list linking each css_set to its tasks, and fix up + * all existing tasks. */ if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); - it->origin_cgrp = cgrp; - it->cset_link = &cgrp->cset_links; + it->origin_css = css; + it->cset_link = &css->cgroup->cset_links; - cgroup_advance_task_iter(it); + css_advance_task_iter(it); } /** - * cgroup_task_iter_next - return the next task for the iterator + * css_task_iter_next - return the next task for the iterator * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been - * initialized via cgroup_task_iter_start(). Returns NULL when the - * iteration reaches the end. + * initialized via css_task_iter_start(). Returns NULL when the iteration + * reaches the end. */ -struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) +struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3288,7 +3289,7 @@ struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) * We reached the end of this task list - move on to the * next cgrp_cset_link. */ - cgroup_advance_task_iter(it); + css_advance_task_iter(it); } else { it->task = l; } @@ -3296,12 +3297,12 @@ struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) } /** - * cgroup_task_iter_end - finish task iteration + * css_task_iter_end - finish task iteration * @it: the task iterator to finish * - * Finish task iteration started by cgroup_task_iter_start(). + * Finish task iteration started by css_task_iter_start(). */ -void cgroup_task_iter_end(struct cgroup_task_iter *it) +void css_task_iter_end(struct css_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3342,24 +3343,24 @@ static inline int started_after(void *p1, void *p2) } /** - * cgroup_scan_tasks - iterate though all the tasks in a cgroup - * @cgrp: the cgroup to iterate tasks of + * css_scan_tasks - iterate though all the tasks in a css + * @css: the css to iterate tasks of * @test: optional test callback * @process: process callback * @data: data passed to @test and @process * @heap: optional pre-allocated heap used for task iteration * - * Iterate through all the tasks in a cgroup, calling @test for each, and - * if it returns %true, call @process for it also. + * Iterate through all the tasks in @css, calling @test for each, and if it + * returns %true, call @process for it also. * * @test may be NULL, meaning always true (select all tasks), which - * effectively duplicates cgroup_task_iter_{start,next,end}() but does not + * effectively duplicates css_task_iter_{start,next,end}() but does not * lock css_set_lock for the call to @process. * * It is guaranteed that @process will act on every task that is a member - * of @cgrp for the duration of this call. This function may or may not - * call @process for tasks that exit or move to a different cgroup during - * the call, or are forked or move into the cgroup during the call. + * of @css for the duration of this call. This function may or may not + * call @process for tasks that exit or move to a different css during the + * call, or are forked or move into the css during the call. * * Note that @test may be called with locks held, and may in some * situations be called multiple times for the same task, so it should be @@ -3370,13 +3371,13 @@ static inline int started_after(void *p1, void *p2) * temporary heap will be used (allocation of which may cause this function * to fail). */ -int cgroup_scan_tasks(struct cgroup *cgrp, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap) +int css_scan_tasks(struct cgroup_subsys_state *css, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap) { int retval, i; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *p, *dropped; /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; @@ -3397,7 +3398,7 @@ int cgroup_scan_tasks(struct cgroup *cgrp, again: /* - * Scan tasks in the cgroup, using the @test callback to determine + * Scan tasks in the css, using the @test callback to determine * which are of interest, and invoking @process callback on the * ones which need an update. Since we don't want to hold any * locks during the task updates, gather tasks to be processed in a @@ -3408,8 +3409,8 @@ int cgroup_scan_tasks(struct cgroup *cgrp, * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_task_iter_start(cgrp, &it); - while ((p = cgroup_task_iter_next(&it))) { + css_task_iter_start(css, &it); + while ((p = css_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3442,7 +3443,7 @@ int cgroup_scan_tasks(struct cgroup *cgrp, * the heap and wasn't inserted */ } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3485,7 +3486,8 @@ static void cgroup_transfer_one_task(struct task_struct *task, void *data) */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - return cgroup_scan_tasks(from, NULL, cgroup_transfer_one_task, to, NULL); + return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, + to, NULL); } /* @@ -3639,7 +3641,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, pid_t *array; int length; int pid, n = 0; /* used for populating the array */ - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; @@ -3654,8 +3656,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(&it))) { + css_task_iter_start(&cgrp->dummy_css, &it); + while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3666,7 +3668,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3700,7 +3702,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { int ret = -EINVAL; struct cgroup *cgrp; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *tsk; /* @@ -3714,8 +3716,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(&it))) { + css_task_iter_start(&cgrp->dummy_css, &it); + while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3735,7 +3737,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); err: return ret; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e0ab9bf..5cd2b6d 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -258,7 +258,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct cgroup_subsys_state *pos; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *task; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -279,9 +279,9 @@ static void update_if_frozen(struct cgroup_subsys_state *css) } /* are all tasks frozen? */ - cgroup_task_iter_start(css->cgroup, &it); + css_task_iter_start(css, &it); - while ((task = cgroup_task_iter_next(&it))) { + while ((task = css_task_iter_next(&it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -296,7 +296,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_task_iter_end(&it); + css_task_iter_end(&it); out_unlock: spin_unlock_irq(&freezer->lock); } @@ -322,26 +322,24 @@ static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, static void freeze_cgroup(struct freezer *freezer) { - struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(&it))) + css_task_iter_start(&freezer->css, &it); + while ((task = css_task_iter_next(&it))) freeze_task(task); - cgroup_task_iter_end(&it); + css_task_iter_end(&it); } static void unfreeze_cgroup(struct freezer *freezer) { - struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(&it))) + css_task_iter_start(&freezer->css, &it); + while ((task = css_task_iter_next(&it))) __thaw_task(task); - cgroup_task_iter_end(&it); + css_task_iter_end(&it); } /** diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6fe23f2..39e5217 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -832,8 +832,8 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) * @tsk: task to test * @data: cpuset to @tsk belongs to * - * Called by cgroup_scan_tasks() for each task in a cgroup whose - * cpus_allowed mask needs to be changed. + * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed + * mask needs to be changed. * * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. @@ -849,27 +849,26 @@ static void cpuset_change_cpumask(struct task_struct *tsk, void *data) /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * * Called with cpuset_mutex held * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * The css_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * No return value. It's guaranteed that css_scan_tasks() always returns 0 * if @heap != NULL. */ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { - cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_cpumask, cs, - heap); + css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); } /* * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. * @root_cs: the root cpuset of the hierarchy * @update_root: update root cpuset or not? - * @heap: the heap used by cgroup_scan_tasks() + * @heap: the heap used by css_scan_tasks() * * This will update cpumasks of tasks in @root_cs and all other empty cpusets * which take on cpumask of @root_cs. @@ -1082,11 +1081,10 @@ static void *cpuset_being_rebound; /** * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 - * if @heap != NULL. + * Called with cpuset_mutex held. No return value. It's guaranteed that + * css_scan_tasks() always returns 0 if @heap != NULL. */ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) { @@ -1109,8 +1107,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_nodemask, &arg, - heap); + css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); /* * All the tasks' nodemasks have been updated, update @@ -1126,7 +1123,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. * @cs: the root cpuset of the hierarchy * @update_root: update the root cpuset or not? - * @heap: the heap used by cgroup_scan_tasks() + * @heap: the heap used by css_scan_tasks() * * This will update nodemasks of tasks in @root_cs and all other empty cpusets * which take on nodemask of @root_cs. @@ -1254,12 +1251,12 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) return 0; } -/* +/** * cpuset_change_flag - make a task's spread flags the same as its cpuset's * @tsk: task to be updated * @data: cpuset to @tsk belongs to * - * Called by cgroup_scan_tasks() for each task in a cgroup. + * Called by css_scan_tasks() for each task in a cgroup. * * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. @@ -1271,22 +1268,22 @@ static void cpuset_change_flag(struct task_struct *tsk, void *data) cpuset_update_task_spread_flag(cs, tsk); } -/* +/** * update_tasks_flags - update the spread flags of tasks in the cpuset. * @cs: the cpuset in which each task's spread flags needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * * Called with cpuset_mutex held * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * The css_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * No return value. It's guaranteed that css_scan_tasks() always returns 0 * if @heap != NULL. */ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) { - cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_flag, cs, heap); + css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); } /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5a5f4dc..95106a9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1799,12 +1799,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; for_each_mem_cgroup_tree(iter, memcg) { - struct cgroup *cgroup = iter->css.cgroup; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(&it))) { + css_task_iter_start(&iter->css, &it); + while ((task = css_task_iter_next(&it))) { switch (oom_scan_process_thread(task, totalpages, NULL, false)) { case OOM_SCAN_SELECT: @@ -1817,7 +1816,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, case OOM_SCAN_CONTINUE: continue; case OOM_SCAN_ABORT: - cgroup_task_iter_end(&it); + css_task_iter_end(&it); mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); @@ -1834,7 +1833,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, get_task_struct(chosen); } } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); } if (!chosen) -- cgit v0.10.2 From 81eeaf0411204f52af8ef78ff107cfca2fcfec1d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make cftype->[un]register_event() deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. cftype->[un]register_event() is among the remaining couple interfaces which still use struct cgroup. Convert it to cgroup_subsys_state. The conversion is mostly mechanical and removes the last users of mem_cgroup_from_cont() and cg_to_vmpressure(), which are removed. v2: indentation update as suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cd105fc..b065d24 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -506,16 +506,18 @@ struct cftype { * you want to provide this functionality. Use eventfd_signal() * on eventfd to send notification to userspace. */ - int (*register_event)(struct cgroup *cgrp, struct cftype *cft, - struct eventfd_ctx *eventfd, const char *args); + int (*register_event)(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd, + const char *args); /* * unregister_event() callback will be called when userspace * closes the eventfd or on cgroup removing. * This callback must be implemented, if you want provide * notification functionality. */ - void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, - struct eventfd_ctx *eventfd); + void (*unregister_event)(struct cgroup_subsys_state *css, + struct cftype *cft, + struct eventfd_ctx *eventfd); }; /* diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 76be077..b239482 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -33,10 +33,12 @@ extern void vmpressure_init(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); -extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, +extern int vmpressure_register_event(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args); -extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, +extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd); #else static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c57301..a71f2e0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -159,9 +159,9 @@ struct css_id { */ struct cgroup_event { /* - * Cgroup which the event belongs to. + * css which the event belongs to. */ - struct cgroup *cgrp; + struct cgroup_subsys_state *css; /* * Control file which the event associated. */ @@ -3955,11 +3955,12 @@ static void cgroup_event_remove(struct work_struct *work) { struct cgroup_event *event = container_of(work, struct cgroup_event, remove); - struct cgroup *cgrp = event->cgrp; + struct cgroup_subsys_state *css = event->css; + struct cgroup *cgrp = css->cgroup; remove_wait_queue(event->wqh, &event->wait); - event->cft->unregister_event(cgrp, event->cft, event->eventfd); + event->cft->unregister_event(css, event->cft, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); @@ -3979,7 +3980,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, { struct cgroup_event *event = container_of(wait, struct cgroup_event, wait); - struct cgroup *cgrp = event->cgrp; + struct cgroup *cgrp = event->css->cgroup; unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { @@ -4048,7 +4049,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; - event->cgrp = cgrp; + event->css = css; INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -4099,7 +4100,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } - ret = event->cft->register_event(cgrp, event->cft, + ret = event->cft->register_event(css, event->cft, event->eventfd, buffer); if (ret) goto out_put_cfile; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 95106a9..2885e3e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1034,11 +1034,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); } -static inline struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) -{ - return mem_cgroup_from_css(cgroup_css(cont, mem_cgroup_subsys_id)); -} - struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { /* @@ -5620,10 +5615,10 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int mem_cgroup_usage_register_event(struct cgroup *cgrp, +static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5703,10 +5698,10 @@ unlock: return ret; } -static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, +static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5782,10 +5777,10 @@ unlock: mutex_unlock(&memcg->thresholds_lock); } -static int mem_cgroup_oom_register_event(struct cgroup *cgrp, +static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5807,10 +5802,10 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, return 0; } -static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, +static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; enum res_type type = MEMFILE_TYPE(cft->private); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 2a8a736..13489b1 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -74,11 +74,6 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work) return container_of(work, struct vmpressure, work); } -static struct vmpressure *cg_to_vmpressure(struct cgroup *cg) -{ - return css_to_vmpressure(cgroup_css(cg, mem_cgroup_subsys_id)); -} - static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); @@ -283,7 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd - * @cg: cgroup that is interested in vmpressure notifications + * @css: css that is interested in vmpressure notifications * @cft: cgroup control files handle * @eventfd: eventfd context to link notifications with * @args: event arguments (used to set up a pressure level threshold) @@ -298,10 +293,11 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * cftype).register_event, and then cgroup core will handle everything by * itself. */ -int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, - struct eventfd_ctx *eventfd, const char *args) +int vmpressure_register_event(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd, + const char *args) { - struct vmpressure *vmpr = cg_to_vmpressure(cg); + struct vmpressure *vmpr = css_to_vmpressure(css); struct vmpressure_event *ev; int level; @@ -329,7 +325,7 @@ int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure - * @cg: cgroup handle + * @css: css handle * @cft: cgroup control files handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * @@ -341,10 +337,11 @@ int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, * cftype).unregister_event, and then cgroup core will handle everything * by itself. */ -void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, +void vmpressure_unregister_event(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd) { - struct vmpressure *vmpr = cg_to_vmpressure(cg); + struct vmpressure *vmpr = css_to_vmpressure(css); struct vmpressure_event *ev; mutex_lock(&vmpr->events_lock); -- cgit v0.10.2 From d99c8727e7bbc01b70e2c57e6127bfab26b868fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: make cgroup_taskset deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. cgroup_taskset which is used by the subsystem attach methods is the last cgroup subsystem API which isn't using css as the handle. Update cgroup_taskset_cur_cgroup() to cgroup_taskset_cur_css() and cgroup_taskset_for_each() to take @skip_css instead of @skip_cgrp. The conversions are pretty mechanical. One exception is cpuset::cgroup_cs(), which lost its last user and got removed. This patch shouldn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Daniel Wagner Cc: Ingo Molnar Cc: Matt Helsley Cc: Steven Rostedt diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4b40640..54ad002 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -891,7 +891,7 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, css->cgroup, tset) { + cgroup_taskset_for_each(task, css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b065d24..d9a9705 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -562,20 +562,22 @@ int cgroup_task_count(const struct cgroup *cgrp); struct cgroup_taskset; struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); -struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset); +struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, + int subsys_id); int cgroup_taskset_size(struct cgroup_taskset *tset); /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor - * @skip_cgrp: skip if task's cgroup matches this, %NULL to iterate through all + * @skip_css: skip if task's css matches this, %NULL to iterate through all * @tset: taskset to iterate */ -#define cgroup_taskset_for_each(task, skip_cgrp, tset) \ +#define cgroup_taskset_for_each(task, skip_css, tset) \ for ((task) = cgroup_taskset_first((tset)); (task); \ (task) = cgroup_taskset_next((tset))) \ - if (!(skip_cgrp) || \ - cgroup_taskset_cur_cgroup((tset)) != (skip_cgrp)) + if (!(skip_css) || \ + cgroup_taskset_cur_css((tset), \ + (skip_css)->ss->subsys_id) != (skip_css)) /* * Control Group subsystem type. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a71f2e0..e5bfb2a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1907,18 +1907,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) EXPORT_SYMBOL_GPL(cgroup_taskset_next); /** - * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task + * cgroup_taskset_cur_css - return the matching css for the current task * @tset: taskset of interest + * @subsys_id: the ID of the target subsystem * - * Return the cgroup for the current (last returned) task of @tset. This - * function must be preceded by either cgroup_taskset_first() or - * cgroup_taskset_next(). + * Return the css for the current (last returned) task of @tset for + * subsystem specified by @subsys_id. This function must be preceded by + * either cgroup_taskset_first() or cgroup_taskset_next(). */ -struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) +struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, + int subsys_id) { - return tset->cur_cgrp; + return cgroup_css(tset->cur_cgrp, subsys_id); } -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); +EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); /** * cgroup_taskset_size - return the number of tasks in taskset diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 5cd2b6d..224da9a 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -189,7 +189,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_css->cgroup, tset) { + cgroup_taskset_for_each(task, new_css, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 39e5217..bf69717 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -119,12 +119,6 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) return css ? container_of(css, struct cpuset, css) : NULL; } -/* Retrieve the cpuset for a cgroup */ -static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) -{ - return css_cs(cgroup_css(cgrp, cpuset_subsys_id)); -} - /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { @@ -1459,7 +1453,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, css->cgroup, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1511,9 +1505,10 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); - struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); + struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, + cpuset_subsys_id); struct cpuset *cs = css_cs(css); - struct cpuset *oldcs = cgroup_cs(oldcgrp); + struct cpuset *oldcs = css_cs(oldcss); struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct cpuset *mems_cs = effective_nodemask_cpuset(cs); @@ -1527,7 +1522,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, css->cgroup, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here diff --git a/kernel/events/core.c b/kernel/events/core.c index 9705a0e..c199c4f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7816,7 +7816,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css->cgroup, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cc9a492..a7122d5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7135,7 +7135,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css->cgroup, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -7153,7 +7153,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css->cgroup, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index e00f60e..d9cd627 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -224,7 +224,7 @@ static void net_prio_attach(struct cgroup_subsys_state *css, struct task_struct *p; void *v; - cgroup_taskset_for_each(p, css->cgroup, tset) { + cgroup_taskset_for_each(p, css, tset) { task_lock(p); v = (void *)(unsigned long)task_netprioidx(p); iterate_fd(p->files, 0, update_netprio, v); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 8ea1184..867b4a3 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -74,7 +74,7 @@ static void cgrp_attach(struct cgroup_subsys_state *css, struct task_struct *p; void *v; - cgroup_taskset_for_each(p, css->cgroup, tset) { + cgroup_taskset_for_each(p, css, tset) { task_lock(p); v = (void *)(unsigned long)task_cls_classid(p); iterate_fd(p->files, 0, update_classid, v); -- cgit v0.10.2 From 95109b627ba6a043c181fa5fa45d1c754dd44fbc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: unexport cgroup_css() cgroup_css() no longer has any user left outside cgroup.c proper and we don't want subsystems to grow new usages of the function. cgroup core should always provide the css to use to the subsystems, which will make dynamic creation and destruction of css's across the lifetime of a cgroup much more manageable than exposing the cgroup directly to subsystems and let them dereference css's from it. Make cgroup_css() a static function in cgroup.c. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d9a9705..c40e508 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -678,19 +678,6 @@ struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) } /** - * cgroup_css - obtain a cgroup's css for the specified subsystem - * @cgrp: the cgroup of interest - * @subsys_id: the subsystem of interest - * - * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. - */ -static inline struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - int subsys_id) -{ - return cgrp->subsys[subsys_id]; -} - -/** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for * @__c: extra condition expression to be passed to rcu_dereference_check() diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5bfb2a..c02a288 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -222,6 +222,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @subsys_id: the subsystem of interest + * + * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. + */ +static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + int subsys_id) +{ + return cgrp->subsys[subsys_id]; +} + /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { -- cgit v0.10.2 From bd8815a6d802fc16a7a106e170593aa05dc17e72 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: make css_for_each_descendant() and friends include the origin css in the iteration Previously, all css descendant iterators didn't include the origin (root of subtree) css in the iteration. The reasons were maintaining consistency with css_for_each_child() and that at the time of introduction more use cases needed skipping the origin anyway; however, given that css_is_descendant() considers self to be a descendant, omitting the origin css has become more confusing and looking at the accumulated use cases rather clearly indicates that including origin would result in simpler code overall. While this is a change which can easily lead to subtle bugs, cgroup API including the iterators has recently gone through major restructuring and no out-of-tree changes will be applicable without adjustments making this a relatively acceptable opportunity for this type of change. The conversions are mostly straight-forward. If the iteration block had explicit origin handling before or after, it's moved inside the iteration. If not, if (pos == origin) continue; is added. Some conversions add extra reference get/put around origin handling by consolidating origin handling and the rest. While the extra ref operations aren't strictly necessary, this shouldn't cause any noticeable difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Michal Hocko Cc: Jens Axboe Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 54ad002..e90c7c1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -615,12 +615,10 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; - u64 sum; + u64 sum = 0; lockdep_assert_held(pd->blkg->q->queue_lock); - sum = blkg_stat_read((void *)pd + off); - rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); @@ -650,13 +648,11 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; - struct blkg_rwstat sum; + struct blkg_rwstat sum = { }; int i; lockdep_assert_held(pd->blkg->q->queue_lock); - sum = blkg_rwstat_read((void *)pd + off); - rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8555386..ae6969a 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -291,6 +291,7 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ @@ -304,7 +305,8 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8cefa7f..8331aba 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1379,7 +1379,6 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - tg_update_has_rules(tg); blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) tg_update_has_rules(blkg_to_tg(blkg)); @@ -1639,8 +1638,6 @@ void blk_throtl_drain(struct request_queue *q) blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) tg_drain_bios(&blkg_to_tg(blkg)->service_queue); - tg_drain_bios(&td_root_tg(td)->service_queue); - /* finally, transfer bios from top-level tg's into the td */ tg_drain_bios(&td->service_queue); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c40e508..8ec5b0f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -798,7 +798,8 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos); * @pos: the css * to use as the loop cursor * @root: css whose descendants to walk * - * Walk @root's descendants. Must be called under rcu_read_lock(). A + * Walk @root's descendants. @root is included in the iteration and the + * first node to be visited. Must be called under rcu_read_lock(). A * descendant css which hasn't finished ->css_online() or already has * finished ->css_offline() may show up during traversal and it's each * subsystem's responsibility to verify that each @pos is alive. @@ -820,13 +821,12 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos); * * my_update_state(@css) * { - * Lock @css; - * Update @css's state; - * Unlock @css; - * * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; - * Verify @pos is alive and inherit state from @pos's parent; + * if (@pos == @css) + * Update @css's state; + * else + * Verify @pos is alive and inherit state from its parent; * Unlock @pos; * } * } @@ -864,8 +864,9 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, * @css: css whose descendants to walk * * Similar to css_for_each_descendant_pre() but performs post-order - * traversal instead. Note that the walk visibility guarantee described in - * pre-order walk doesn't apply the same to post-order walks. + * traversal instead. @root is included in the iteration and the last + * node to be visited. Note that the walk visibility guarantee described + * in pre-order walk doesn't apply the same to post-order walks. */ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c02a288..52f0498 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2868,17 +2868,6 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) mutex_unlock(&cgroup_mutex); - /* @root always needs to be updated */ - inode = root->dentry->d_inode; - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - ret = cgroup_addrm_files(root, cfts, is_add); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - - if (ret) - goto out_deact; - /* add/rm files for all cgroups created before */ rcu_read_lock(); css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) { @@ -2907,7 +2896,6 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) } rcu_read_unlock(); dput(prev); -out_deact: deactivate_super(sb); return ret; } @@ -3099,7 +3087,8 @@ EXPORT_SYMBOL_GPL(css_next_child); * @root: css whose descendants to walk * * To be used by css_for_each_descendant_pre(). Find the next descendant - * to visit for pre-order traversal of @root's descendants. + * to visit for pre-order traversal of @root's descendants. @root is + * included in the iteration and the first node to be visited. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This @@ -3114,9 +3103,9 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, pretend we just visited @root */ + /* if first iteration, visit @root */ if (!pos) - pos = root; + return root; /* visit the first child if exists */ next = css_next_child(NULL, pos); @@ -3186,7 +3175,8 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_post(). Find the next descendant - * to visit for post-order traversal of @root's descendants. + * to visit for post-order traversal of @root's descendants. @root is + * included in the iteration and the last node to be visited. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This @@ -3207,14 +3197,17 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return next != root ? next : NULL; } + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + /* if there's an unvisited sibling, visit its leftmost descendant */ next = css_next_child(pos, css_parent(pos)); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ - next = css_parent(pos); - return next != root ? next : NULL; + return css_parent(pos); } EXPORT_SYMBOL_GPL(css_next_descendant_post); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 224da9a..f0ff64d 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -311,7 +311,6 @@ static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, /* update states bottom-up */ css_for_each_descendant_post(pos, css) update_if_frozen(pos); - update_if_frozen(css); rcu_read_unlock(); @@ -391,11 +390,6 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) { struct cgroup_subsys_state *pos; - /* update @freezer */ - spin_lock_irq(&freezer->lock); - freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); - spin_unlock_irq(&freezer->lock); - /* * Update all its descendants in pre-order traversal. Each * descendant will try to inherit its parent's FREEZING state as @@ -406,14 +400,23 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); - /* - * Our update to @parent->state is already visible which is - * all we need. No need to lock @parent. For more info on - * synchronization, see freezer_post_create(). - */ spin_lock_irq(&pos_f->lock); - freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, - CGROUP_FREEZING_PARENT); + + if (pos_f == freezer) { + freezer_apply_state(pos_f, freeze, + CGROUP_FREEZING_SELF); + } else { + /* + * Our update to @parent->state is already visible + * which is all we need. No need to lock @parent. + * For more info on synchronization, see + * freezer_post_create(). + */ + freezer_apply_state(pos_f, + parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + } + spin_unlock_irq(&pos_f->lock); } rcu_read_unlock(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index bf69717..72a0383 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -222,7 +222,8 @@ static struct cpuset top_cpuset = { * * Walk @des_cs through the online descendants of @root_cs. Must be used * with RCU read locked. The caller may modify @pos_css by calling - * css_rightmost_descendant() to skip subtree. + * css_rightmost_descendant() to skip subtree. @root_cs is included in the + * iteration and the first node to be visited. */ #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ @@ -506,6 +507,9 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + if (cp == root_cs) + continue; + /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { pos_css = css_rightmost_descendant(pos_css); @@ -613,6 +617,8 @@ static int generate_sched_domains(cpumask_var_t **domains, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { + if (cp == &top_cpuset) + continue; /* * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The @@ -875,15 +881,17 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, struct cpuset *cp; struct cgroup_subsys_state *pos_css; - if (update_root) - update_tasks_cpumask(root_cs, heap); - rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - /* skip the whole subtree if @cp have some CPU */ - if (!cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; + if (cp == root_cs) { + if (!update_root) + continue; + } else { + /* skip the whole subtree if @cp have some CPU */ + if (!cpumask_empty(cp->cpus_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } } if (!css_tryget(&cp->css)) continue; @@ -1130,15 +1138,17 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, struct cpuset *cp; struct cgroup_subsys_state *pos_css; - if (update_root) - update_tasks_nodemask(root_cs, heap); - rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - /* skip the whole subtree if @cp have some CPU */ - if (!nodes_empty(cp->mems_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; + if (cp == root_cs) { + if (!update_root) + continue; + } else { + /* skip the whole subtree if @cp have some CPU */ + if (!nodes_empty(cp->mems_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } } if (!css_tryget(&cp->css)) continue; @@ -2237,7 +2247,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (!css_tryget(&cs->css)) + if (cs == &top_cpuset || !css_tryget(&cs->css)) continue; rcu_read_unlock(); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2885e3e..b89d4cb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1079,14 +1079,7 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, { struct cgroup_subsys_state *prev_css, *next_css; - /* - * Root is not visited by cgroup iterators so it needs an - * explicit visit. - */ - if (!last_visited) - return root; - - prev_css = (last_visited == root) ? NULL : &last_visited->css; + prev_css = last_visited ? &last_visited->css : NULL; skip_node: next_css = css_next_descendant_pre(prev_css, &root->css); diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 9bf230a..c123628 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -456,7 +456,7 @@ static int propagate_exception(struct dev_cgroup *devcg_root, * methods), and online ones are safe to access outside RCU * read lock without bumping refcnt. */ - if (!is_devcg_online(devcg)) + if (pos == &devcg_root->css || !is_devcg_online(devcg)) continue; rcu_read_unlock(); -- cgit v0.10.2 From 40e93b39cd5b6a347333a95152ce37deef37bbd0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:53 -0400 Subject: cgroup: always use cgroup_css() cgroup_css() is the accessor for cgroup->subsys[] but is not used consistently. cgroup->subsys[] will become RCU protected and cgroup_css() will grow synchronization sanity checks. In preparation, make all cgroup->subsys[] dereferences use cgroup_css() consistently. This patch doesn't introduce any functional difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 52f0498..49ad96e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -574,7 +574,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ - template[i] = cgrp->subsys[i]; + template[i] = cgroup_css(cgrp, i); } else { /* Subsystem is not in this hierarchy, so we * don't want to change the subsystem state */ @@ -871,7 +871,7 @@ static void cgroup_free_fn(struct work_struct *work) * Release the subsystem state objects. */ for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); ss->css_free(css); } @@ -1067,27 +1067,27 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgrp->subsys[i]); - BUG_ON(!cgroup_dummy_top->subsys[i]); - BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); + BUG_ON(cgroup_css(cgrp, i)); + BUG_ON(!cgroup_css(cgroup_dummy_top, i)); + BUG_ON(cgroup_css(cgroup_dummy_top, i)->cgroup != cgroup_dummy_top); cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; - cgrp->subsys[i]->cgroup = cgrp; + cgroup_css(cgrp, i)->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgrp->subsys[i]); + ss->bind(cgroup_css(cgrp, i)); /* refcount was already taken, and we're keeping it */ root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ - BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); - BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + BUG_ON(cgroup_css(cgrp, i) != cgroup_css(cgroup_dummy_top, i)); + BUG_ON(cgroup_css(cgrp, i)->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_dummy_top->subsys[i]); - cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; + ss->bind(cgroup_css(cgroup_dummy_top, i)); + cgroup_css(cgroup_dummy_top, i)->cgroup = cgroup_dummy_top; cgrp->subsys[i] = NULL; cgroup_subsys[i]->root = &cgroup_dummy_root; list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); @@ -2072,7 +2072,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (ss->can_attach) { retval = ss->can_attach(css, &tset); @@ -2114,7 +2114,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (ss->attach) ss->attach(css, &tset); @@ -2136,7 +2136,7 @@ out_put_css_set_refs: out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (ss == failed_ss) break; @@ -2308,7 +2308,7 @@ static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); if (cft->ss) - return cgrp->subsys[cft->ss->subsys_id]; + return cgroup_css(cgrp, cft->ss->subsys_id); return &cgrp->dummy_css; } @@ -4241,7 +4241,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* This cgroup is ready now */ for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); struct css_id *id = rcu_dereference_protected(css->id, true); /* @@ -4285,7 +4285,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->id = NULL; if (cgrp == cgroup_dummy_top) css->flags |= CSS_ROOT; - BUG_ON(cgrp->subsys[ss->subsys_id]); + BUG_ON(cgroup_css(cgrp, ss->subsys_id)); cgrp->subsys[ss->subsys_id] = css; /* @@ -4300,7 +4300,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -4315,7 +4315,7 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); lockdep_assert_held(&cgroup_mutex); @@ -4400,7 +4400,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(parent->subsys[ss->subsys_id]); + css = ss->css_alloc(cgroup_css(parent, ss->subsys_id)); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4477,7 +4477,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (css) { percpu_ref_cancel_init(&css->refcnt); @@ -4590,7 +4590,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ atomic_set(&cgrp->css_kill_cnt, 1); for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); /* * Killing would put the base ref, but we need to keep it @@ -4676,7 +4676,7 @@ static void cgroup_offline_fn(struct work_struct *work) * destruction happens only after all css's are released. */ for_each_root_subsys(cgrp->root, ss) - css_put(cgrp->subsys[ss->subsys_id]); + css_put(cgroup_css(cgrp, ss->subsys_id)); /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); @@ -4741,7 +4741,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, cgroup_dummy_top); @@ -4820,7 +4820,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -4936,7 +4936,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. */ - ss->css_free(cgroup_dummy_top->subsys[ss->subsys_id]); + ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id)); cgroup_dummy_top->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -5562,8 +5562,8 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, struct css_id *child_id, *parent_id; subsys_id = ss->subsys_id; - parent_css = parent->subsys[subsys_id]; - child_css = child->subsys[subsys_id]; + parent_css = cgroup_css(parent, subsys_id); + child_css = cgroup_css(child, subsys_id); parent_id = rcu_dereference_protected(parent_css->id, true); depth = parent_id->depth + 1; @@ -5624,7 +5624,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) /* get cgroup */ cgrp = __d_cgrp(f->f_dentry); - css = cgrp->subsys[id]; + css = cgroup_css(cgrp, id); return css ? css : ERR_PTR(-ENOENT); } -- cgit v0.10.2 From 35ef10da65d43211f4cd7e7822cbb3becdfc0ae1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:54 -0400 Subject: cgroup: rename cgroup_subsys_state->dput_work and its callback function css (cgroup_subsys_state) will become RCU protected and there will be two stages which require punting to work item during release. To prepare for using the work item for multiple times, rename css->dput_work to css->destroy_work and css_dput_fn() to css_free_work_fn() and move work item initialization from css init to right before the actual usage. This reorganization doesn't introduce any behavior change. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8ec5b0f..12d66fe 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -80,7 +80,7 @@ struct cgroup_subsys_state { struct css_id __rcu *id; /* Used to put @cgroup->dentry on the last css_put() */ - struct work_struct dput_work; + struct work_struct destroy_work; }; /* bits in struct cgroup_subsys_state flags field */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 49ad96e..0b28097 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4259,10 +4259,10 @@ err: return ret; } -static void css_dput_fn(struct work_struct *work) +static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, dput_work); + container_of(work, struct cgroup_subsys_state, destroy_work); cgroup_dput(css->cgroup); } @@ -4272,7 +4272,14 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - schedule_work(&css->dput_work); + /* + * css holds an extra ref to @cgrp->dentry which is put on the last + * css_put(). dput() requires process context, which css_put() may + * be called without. @css->destroy_work will be used to invoke + * dput() asynchronously from css_put(). + */ + INIT_WORK(&css->destroy_work, css_free_work_fn); + schedule_work(&css->destroy_work); } static void init_cgroup_css(struct cgroup_subsys_state *css, @@ -4287,14 +4294,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->flags |= CSS_ROOT; BUG_ON(cgroup_css(cgrp, ss->subsys_id)); cgrp->subsys[ss->subsys_id] = css; - - /* - * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context, which css_put() may - * be called without. @css->dput_work will be used to invoke - * dput() asynchronously from css_put(). - */ - INIT_WORK(&css->dput_work, css_dput_fn); } /* invoke ->css_online() on a new CSS and mark it online if successful */ -- cgit v0.10.2 From 0ae78e0bf10ac38ab53548e18383afc9997eca22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:54 -0400 Subject: cgroup: add cgroup_subsys_state->parent With the planned unified hierarchy, css's (cgroup_subsys_state) will be RCU protected and allowed to be attached and detached dynamically over the course of a cgroup's lifetime. This means that css's will stay accessible after being detached from its cgroup - the matching pointer in cgroup->subsys[] cleared - for ref draining and RCU grace period. cgroup core still wants to guarantee that the parent css is never destroyed before its children and css_parent() always returns the parent regardless of the state of the child css as long as it's accessible. This patch makes css's hold onto their parents and adds css->parent so that the parent css is never detroyed before its children and can be determined without consulting the cgroups. cgroup->dummy_css is also updated to point to the parent dummy_css; however, it doesn't need to worry about object lifetime as the parent cgroup is already pinned by the child. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 12d66fe..8a5dc91 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -75,6 +75,9 @@ struct cgroup_subsys_state { /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; + /* the parent css */ + struct cgroup_subsys_state *parent; + unsigned long flags; /* ID for this css, if possible */ struct css_id __rcu *id; @@ -666,15 +669,7 @@ struct cgroup_subsys { static inline struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) { - struct cgroup *parent_cgrp = css->cgroup->parent; - - if (!parent_cgrp) - return NULL; - - if (css->ss) - return parent_cgrp->subsys[css->ss->subsys_id]; - else - return &parent_cgrp->dummy_css; + return css->parent; } /** diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0b28097..5c6dd7e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4264,6 +4264,9 @@ static void css_free_work_fn(struct work_struct *work) struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); + if (css->parent) + css_put(css->parent); + cgroup_dput(css->cgroup); } @@ -4290,8 +4293,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->ss = ss; css->flags = 0; css->id = NULL; - if (cgrp == cgroup_dummy_top) + + if (cgrp->parent) + css->parent = cgroup_css(cgrp->parent, ss->subsys_id); + else css->flags |= CSS_ROOT; + BUG_ON(cgroup_css(cgrp, ss->subsys_id)); cgrp->subsys[ss->subsys_id] = css; } @@ -4388,6 +4395,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->dentry = dentry; cgrp->parent = parent; + cgrp->dummy_css.parent = &parent->dummy_css; cgrp->root = parent->root; if (notify_on_release(parent)) @@ -4436,9 +4444,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - /* each css holds a ref to the cgroup's dentry */ - for_each_root_subsys(root, ss) + /* each css holds a ref to the cgroup's dentry and the parent css */ + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + dget(dentry); + percpu_ref_get(&css->parent->refcnt); + } /* hold a ref to the parent's dentry */ dget(parent->dentry); -- cgit v0.10.2 From b77d7b6088377998ebf65eaea5e51008c2d75e94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:54 -0400 Subject: cgroup: cgroup_css_from_dir() now should be called with RCU read locked cgroup->subsys[] will become RCU protected and thus all cgroup_css() usages should either be under RCU read lock or cgroup_mutex. This patch updates cgroup_css_from_dir() which returns the matching cgroup_subsys_state given a directory file and subsys_id so that it requires RCU read lock and updates its sole user perf_cgroup_connect(). Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5c6dd7e..cbb6314 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5616,8 +5616,14 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) } EXPORT_SYMBOL_GPL(css_lookup); -/* - * get corresponding css from file open on cgroupfs directory +/** + * cgroup_css_from_dir - get corresponding css from file open on cgroup dir + * @f: directory file of interest + * @id: subsystem id of interest + * + * Must be called under RCU read lock. The caller is responsible for + * pinning the returned css if it needs to be accessed outside the RCU + * critical section. */ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) { @@ -5625,6 +5631,8 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) struct inode *inode; struct cgroup_subsys_state *css; + WARN_ON_ONCE(!rcu_read_lock_held()); + inode = file_inode(f); /* check in cgroup filesystem dir */ if (inode->i_op != &cgroup_dir_inode_operations) diff --git a/kernel/events/core.c b/kernel/events/core.c index c199c4f..23261f9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -591,6 +591,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; + rcu_read_lock(); + css = cgroup_css_from_dir(f.file, perf_subsys_id); if (IS_ERR(css)) { ret = PTR_ERR(css); @@ -617,6 +619,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: + rcu_read_unlock(); fdput(f); return ret; } -- cgit v0.10.2 From 105347ba5da3e87facce2337c50cd5df93cc6bec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:55 -0400 Subject: cgroup: make cgroup_file_open() rcu_read_lock() around cgroup_css() and add cfent->css For the planned unified hierarchy, each css (cgroup_subsys_state) will be RCU protected so that it can be created and destroyed individually while allowing RCU accesses, and cgroup_css() will soon require either holding cgroup_mutex or RCU read lock. This patch updates cgroup_file_open() such that it acquires the associated css under rcu_read_lock(). While cgroup_file_css() usages in other file operations are safe due to the reference from open, cgroup_css() wouldn't know that and will still trigger warnings. It'd be cleanest to store the acquired css in file->prvidate_data for further file operations but that's already used by seqfile. This patch instead adds cfent->css to cache the associated css. Note that while this field is initialized during cfe init, it should only be considered valid while the file is open. This patch doesn't change visible behavior. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cbb6314..d63beff 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -117,6 +117,7 @@ struct cfent { struct list_head node; struct dentry *dentry; struct cftype *type; + struct cgroup_subsys_state *css; /* file xattrs */ struct simple_xattrs xattrs; @@ -2301,17 +2302,6 @@ static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, return 0; } -/* return the css for the given cgroup file */ -static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) -{ - struct cftype *cft = cfe->type; - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); - - if (cft->ss) - return cgroup_css(cgrp, cft->ss->subsys_id); - return &cgrp->dummy_css; -} - /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 @@ -2388,7 +2378,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; if (cft->write) return cft->write(css, cft, file, buf, nbytes, ppos); @@ -2430,7 +2420,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; if (cft->read) return cft->read(css, cft, file, buf, nbytes, ppos); @@ -2456,7 +2446,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cfent *cfe = m->private; struct cftype *cft = cfe->type; - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; if (cft->read_map) { struct cgroup_map_cb cb = { @@ -2479,7 +2469,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + struct cgroup_subsys_state *css; int err; err = generic_file_open(inode, file); @@ -2491,7 +2482,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file) * unpinned either on open failure or release. This ensures that * @css stays alive for all file operations. */ - if (css->ss && !css_tryget(css)) + rcu_read_lock(); + if (cft->ss) { + css = cgroup_css(cgrp, cft->ss->subsys_id); + if (!css_tryget(css)) + css = NULL; + } else { + css = &cgrp->dummy_css; + } + rcu_read_unlock(); + + /* css should match @cfe->css, see cgroup_add_file() for details */ + if (!css || WARN_ON_ONCE(css != cfe->css)) return -ENODEV; if (cft->read_map || cft->read_seq_string) { @@ -2510,7 +2512,7 @@ static int cgroup_file_release(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; int ret = 0; if (cft->release) @@ -2772,6 +2774,18 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) dentry->d_fsdata = cfe; simple_xattrs_init(&cfe->xattrs); + /* + * cfe->css is used by read/write/close to determine the associated + * css. file->private_data would be a better place but that's + * already used by seqfile. Note that open will use the usual + * cgroup_css() and css_tryget() to acquire the css and this + * caching doesn't affect css lifetime management. + */ + if (cft->ss) + cfe->css = cgroup_css(cgrp, cft->ss->subsys_id); + else + cfe->css = &cgrp->dummy_css; + mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { -- cgit v0.10.2 From 73e80ed8007fc48a6deeb295ba37159fad274bd2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:55 -0400 Subject: cgroup: add __rcu modifier to cgroup->subsys[] For the planned unified hierarchy, each css (cgroup_subsys_state) will be RCU protected so that it can be created and destroyed individually while allowing RCU accesses. Previous changes ensured that all cgroup->subsys[] accesses use the cgroup_css() accessor. This patch adds __rcu modifier to cgroup->subsys[], add matching RCU dereference in cgroup_css() and convert all assignments to either rcu_assign_pointer() or RCU_INIT_POINTER(). This change prepares for the actual RCUfication of css's and doesn't introduce any visible behavior change. The conversion is verified with sparse and all accesses are properly RCU annotated. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8a5dc91..eb200b5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -204,7 +204,7 @@ struct cgroup { struct cgroup_name __rcu *name; /* Private pointers for each registered subsystem */ - struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; + struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; struct cgroupfs_root *root; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d63beff..c271016 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -229,11 +229,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], * @subsys_id: the subsystem of interest * * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. + * This function must be called either under cgroup_mutex or + * rcu_read_lock() and the caller is responsible for pinning the returned + * css if it wants to keep accessing it outside the said locks. This + * function may return %NULL if @cgrp doesn't have @subsys_id enabled. */ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, int subsys_id) { - return cgrp->subsys[subsys_id]; + return rcu_dereference_check(cgrp->subsys[subsys_id], + lockdep_is_held(&cgroup_mutex)); } /* convenient tests for these bits */ @@ -1072,8 +1077,10 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(!cgroup_css(cgroup_dummy_top, i)); BUG_ON(cgroup_css(cgroup_dummy_top, i)->cgroup != cgroup_dummy_top); - cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; + rcu_assign_pointer(cgrp->subsys[i], + cgroup_css(cgroup_dummy_top, i)); cgroup_css(cgrp, i)->cgroup = cgrp; + list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) @@ -1088,8 +1095,10 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (ss->bind) ss->bind(cgroup_css(cgroup_dummy_top, i)); + cgroup_css(cgroup_dummy_top, i)->cgroup = cgroup_dummy_top; - cgrp->subsys[i] = NULL; + RCU_INIT_POINTER(cgrp->subsys[i], NULL); + cgroup_subsys[i]->root = &cgroup_dummy_root; list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); @@ -4314,7 +4323,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->flags |= CSS_ROOT; BUG_ON(cgroup_css(cgrp, ss->subsys_id)); - cgrp->subsys[ss->subsys_id] = css; + rcu_assign_pointer(cgrp->subsys[ss->subsys_id], css); } /* invoke ->css_online() on a new CSS and mark it online if successful */ @@ -4962,7 +4971,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * also takes care of freeing the css_id. */ ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id)); - cgroup_dummy_top->subsys[ss->subsys_id] = NULL; + RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); mutex_unlock(&cgroup_mutex); } -- cgit v0.10.2 From 623f926b050e12b0f5e3a2f4d11c36e4ddd63541 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:55 -0400 Subject: cgroup: reorganize css init / exit paths css (cgroup_subsys_state) lifetime management is about to be restructured. In prepartion, make the following mostly trivial changes. * init_cgroup_css() is renamed to init_css() so that it's consistent with other css handling functions. * alloc_css_id(), online_css() and offline_css() updated to take @css instead of cgroups and subsys IDs. This patch doesn't make any functional changes. v2: v1 merged two for_each_root_subsys() loops in cgroup_create() but Li Zefan pointed out that it breaks error path. Dropped. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c271016..a1ebc44 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -838,8 +838,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static int alloc_css_id(struct cgroup_subsys *ss, - struct cgroup *parent, struct cgroup *child); +static int alloc_css_id(struct cgroup_subsys_state *child_css); static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { @@ -4308,9 +4307,8 @@ static void css_release(struct percpu_ref *ref) schedule_work(&css->destroy_work); } -static void init_cgroup_css(struct cgroup_subsys_state *css, - struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, + struct cgroup *cgrp) { css->cgroup = cgrp; css->ss = ss; @@ -4327,9 +4325,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, } /* invoke ->css_online() on a new CSS and mark it online if successful */ -static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +static int online_css(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys *ss = css->ss; int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -4342,9 +4340,9 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) } /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ -static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void offline_css(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys *ss = css->ss; lockdep_assert_held(&cgroup_mutex); @@ -4442,10 +4440,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; } - init_cgroup_css(css, ss, cgrp); + init_css(css, ss, cgrp); if (ss->use_id) { - err = alloc_css_id(ss, parent, cgrp); + err = alloc_css_id(css); if (err) goto err_free_all; } @@ -4480,7 +4478,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* creation succeeded, notify subsystems */ for_each_root_subsys(root, ss) { - err = online_css(ss, cgrp); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + + err = online_css(css); if (err) goto err_destroy; @@ -4700,7 +4700,7 @@ static void cgroup_offline_fn(struct work_struct *work) * initate destruction. */ for_each_root_subsys(cgrp->root, ss) - offline_css(ss, cgrp); + offline_css(cgroup_css(cgrp, ss->subsys_id)); /* * Put the css refs from cgroup_destroy_locked(). Each css holds @@ -4778,7 +4778,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_cgroup_css(css, ss, cgroup_dummy_top); + init_css(css, ss, cgroup_dummy_top); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4793,7 +4793,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - BUG_ON(online_css(ss, cgroup_dummy_top)); + BUG_ON(online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id))); mutex_unlock(&cgroup_mutex); @@ -4866,8 +4866,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) ss->root = &cgroup_dummy_root; /* our new subsystem will be attached to the dummy hierarchy. */ - init_cgroup_css(css, ss, cgroup_dummy_top); - /* init_idr must be after init_cgroup_css because it sets css->id. */ + init_css(css, ss, cgroup_dummy_top); + /* init_idr must be after init_css() because it sets css->id. */ if (ss->use_id) { ret = cgroup_init_idr(ss, css); if (ret) @@ -4897,7 +4897,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ret = online_css(ss, cgroup_dummy_top); + ret = online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); if (ret) goto err_unload; @@ -4936,7 +4936,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); - offline_css(ss, cgroup_dummy_top); + offline_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); if (ss->use_id) idr_destroy(&ss->idr); @@ -5588,20 +5588,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, return 0; } -static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, - struct cgroup *child) +static int alloc_css_id(struct cgroup_subsys_state *child_css) { - int subsys_id, i, depth = 0; - struct cgroup_subsys_state *parent_css, *child_css; + struct cgroup_subsys_state *parent_css = css_parent(child_css); struct css_id *child_id, *parent_id; + int i, depth; - subsys_id = ss->subsys_id; - parent_css = cgroup_css(parent, subsys_id); - child_css = cgroup_css(child, subsys_id); parent_id = rcu_dereference_protected(parent_css->id, true); depth = parent_id->depth + 1; - child_id = get_new_cssid(ss, depth); + child_id = get_new_cssid(child_css->ss, depth); if (IS_ERR(child_id)) return PTR_ERR(child_id); -- cgit v0.10.2 From ae7f164a09408bf21ab3c82a9e80a3ff37aa9e3e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: move cgroup->subsys[] assignment to online_css() Currently, css (cgroup_subsys_state) lifetime is tied to that of the associated cgroup. With the planned unified hierarchy, css's will be dynamically created and destroyed within the lifetime of a cgroup. To enable such usages, css's will be individually RCU protected instead of being tied to the cgroup. In preparation, this patch moves cgroup->subsys[] assignment from init_css() to online_css(). As this means that a newly initialized css should be remembered separately and that cgroup_css() returns NULL between init and online, cgroup_create() is updated so that it stores newly created css's in a local array css_ar[] and cgroup_init/load_subsys() are updated to use local variable @css instead of using cgroup_css(). This change also slightly simplifies error path of cgroup_create(). While this patch changes when cgroup->subsys[] is initialized, this change isn't visible to subsystems or userland. v2: This patch wasn't updated accordingly after the previous "cgroup: reorganize css init / exit paths" was updated leading to missing a css_ar[] conversion in cgroup_create() and thus boot failure. Fix it. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a1ebc44..b9f736c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4321,7 +4321,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->flags |= CSS_ROOT; BUG_ON(cgroup_css(cgrp, ss->subsys_id)); - rcu_assign_pointer(cgrp->subsys[ss->subsys_id], css); } /* invoke ->css_online() on a new CSS and mark it online if successful */ @@ -4334,8 +4333,10 @@ static int online_css(struct cgroup_subsys_state *css) if (ss->css_online) ret = ss->css_online(css); - if (!ret) + if (!ret) { css->flags |= CSS_ONLINE; + rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); + } return ret; } @@ -4366,6 +4367,7 @@ static void offline_css(struct cgroup_subsys_state *css) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { + struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4433,12 +4435,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = PTR_ERR(css); goto err_free_all; } + css_ar[ss->subsys_id] = css; err = percpu_ref_init(&css->refcnt, css_release); - if (err) { - ss->css_free(css); + if (err) goto err_free_all; - } init_css(css, ss, cgrp); @@ -4467,7 +4468,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* each css holds a ref to the cgroup's dentry and the parent css */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; dget(dentry); percpu_ref_get(&css->parent->refcnt); @@ -4478,7 +4479,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* creation succeeded, notify subsystems */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; err = online_css(css); if (err) @@ -4511,7 +4512,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; if (css) { percpu_ref_cancel_init(&css->refcnt); @@ -4793,7 +4794,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - BUG_ON(online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id))); + BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); @@ -4897,7 +4898,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ret = online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + ret = online_css(css); if (ret) goto err_unload; -- cgit v0.10.2 From 223dbc38d2a8745a93749dc75ed909e274ce075d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: bounce cgroup_subsys_state ref kill confirmation to a work item css (cgroup_subsys_state) offlining, which requires process context, will be moved to ref kill confirmation. In preparation, bounce css_killed handling through css->destroy_work. css_ref_killed_fn() is renamed to css_killed_ref_fn() so that it's consistent with the new css_killed_work_fn(). This patch adds an additional work item bouncing but doesn't change the actual logic. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b9f736c..398ffbb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4555,12 +4555,27 @@ static void cgroup_css_killed(struct cgroup *cgrp) schedule_work(&cgrp->destroy_work); } -static void css_ref_killed_fn(struct percpu_ref *ref) +/* + * This is called when the refcnt of a css is confirmed to be killed. + * css_tryget() is now guaranteed to fail. + */ +static void css_killed_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup *cgrp = css->cgroup; + + cgroup_css_killed(cgrp); +} + +/* css kill confirmation processing requires process context, bounce */ +static void css_killed_ref_fn(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - cgroup_css_killed(css->cgroup); + INIT_WORK(&css->destroy_work, css_killed_work_fn); + schedule_work(&css->destroy_work); } /** @@ -4634,7 +4649,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) percpu_ref_get(&css->refcnt); atomic_inc(&cgrp->css_kill_cnt); - percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } cgroup_css_killed(cgrp); -- cgit v0.10.2 From f20104de55a212a9742d8df1807f1f29dc95b748 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: replace cgroup->css_kill_cnt with ->nr_css Currently, css (cgroup_subsys_state) lifetime is tied to that of the associated cgroup. With the planned unified hierarchy, css's will be dynamically created and destroyed within the lifetime of a cgroup. To enable such usages, css's will be individually RCU protected instead of being tied to the cgroup. cgroup->css_kill_cnt is used during cgroup destruction to wait for css reference count disable; however, this model doesn't work once css's lifetimes are managed separately from cgroup's. This patch replaces it with cgroup->nr_css which is an cgroup_mutex protected integer counting the number of attached css's. The count is incremented from online_css() and decremented after refcnt kill is confirmed. If the count reaches zero and the cgroup is marked dead, the second stage of cgroup destruction is kicked off. If a cgroup doesn't have any css attached at the time of rmdir, cgroup_destroy_locked() now invokes the second stage directly as no css kill confirmation would happen. cgroup_offline_fn() - the second step of cgroup destruction - is renamed to cgroup_destroy_css_killed() and now expects to be called with cgroup_mutex held. While this patch changes how css destruction is punted to work items, it shouldn't change any visible behavior. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index eb200b5..80dca87 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -171,6 +171,9 @@ struct cgroup { */ int id; + /* the number of attached css's */ + int nr_css; + /* * We link our 'sibling' struct into our parent's 'children'. * Our children link their 'sibling' into our 'children'. @@ -234,7 +237,6 @@ struct cgroup { /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; - atomic_t css_kill_cnt; /* List of events which userspace want to receive */ struct list_head event_list; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 398ffbb..174f4c3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -218,7 +218,7 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; -static void cgroup_offline_fn(struct work_struct *work); +static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -4335,6 +4335,7 @@ static int online_css(struct cgroup_subsys_state *css) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; + css->cgroup->nr_css++; rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); } return ret; @@ -4545,16 +4546,6 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -static void cgroup_css_killed(struct cgroup *cgrp) -{ - if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) - return; - - /* percpu ref's of all css's are killed, kick off the next step */ - INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); - schedule_work(&cgrp->destroy_work); -} - /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget() is now guaranteed to fail. @@ -4565,7 +4556,17 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - cgroup_css_killed(cgrp); + mutex_lock(&cgroup_mutex); + + /* + * If @cgrp is marked dead, it's waiting for refs of all css's to + * be disabled before proceeding to the second phase of cgroup + * destruction. If we are the last one, kick it off. + */ + if (!--cgrp->nr_css && cgroup_is_dead(cgrp)) + cgroup_destroy_css_killed(cgrp); + + mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ @@ -4634,11 +4635,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * Use percpu_ref_kill_and_confirm() to get notifications as each * css is confirmed to be seen as killed on all CPUs. The * notification callback keeps track of the number of css's to be - * killed and schedules cgroup_offline_fn() to perform the rest of - * destruction once the percpu refs of all css's are confirmed to - * be killed. + * killed and invokes cgroup_destroy_css_killed() to perform the + * rest of destruction once the percpu refs of all css's are + * confirmed to be killed. */ - atomic_set(&cgrp->css_kill_cnt, 1); for_each_root_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); @@ -4648,10 +4648,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ percpu_ref_get(&css->refcnt); - atomic_inc(&cgrp->css_kill_cnt); percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } - cgroup_css_killed(cgrp); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4669,6 +4667,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) raw_spin_unlock(&release_list_lock); /* + * If @cgrp has css's attached, the second stage of cgroup + * destruction is kicked off from css_killed_work_fn() after the + * refs of all attached css's are killed. If @cgrp doesn't have + * any css, we kick it off here. + */ + if (!cgrp->nr_css) + cgroup_destroy_css_killed(cgrp); + + /* * Clear and remove @cgrp directory. The removal puts the base ref * but we aren't quite done with @cgrp yet, so hold onto it. */ @@ -4693,7 +4700,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) }; /** - * cgroup_offline_fn - the second step of cgroup destruction + * cgroup_destroy_css_killed - the second step of cgroup destruction * @work: cgroup->destroy_free_work * * This function is invoked from a work item for a cgroup which is being @@ -4702,14 +4709,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * is the second step of destruction described in the comment above * cgroup_destroy_locked(). */ -static void cgroup_offline_fn(struct work_struct *work) +static void cgroup_destroy_css_killed(struct cgroup *cgrp) { - struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); struct cgroup *parent = cgrp->parent; struct dentry *d = cgrp->dentry; struct cgroup_subsys *ss; - mutex_lock(&cgroup_mutex); + lockdep_assert_held(&cgroup_mutex); /* * css_tryget() is guaranteed to fail now. Tell subsystems to @@ -4743,8 +4749,6 @@ static void cgroup_offline_fn(struct work_struct *work) set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); - - mutex_unlock(&cgroup_mutex); } static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) -- cgit v0.10.2 From 09a503ea3a816b285b0b402b7f785eaec0c7a7e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: decouple cgroup_subsys_state destruction from cgroup destruction Currently, css (cgroup_subsys_state) lifetime is tied to that of the associated cgroup. css's are created when the associated cgroup is created and destroyed when it gets destroyed. Also, individual css's aren't RCU protected but the whole cgroup is. With the planned unified hierarchy, css's will need to be dynamically created and destroyed within the lifetime of a cgroup. To enable such usages, this patch decouples css destruction from cgroup destruction - offline_css() invocation and the final css_put() are moved from cgroup_destroy_css_killed() to css_killed_work_fn(). Now each css is individually offlined and put as its reference count is killed instead of waiting for all css's attached to the cgroup to finish refcnt killing and then proceeding to offlining and putting them together. While this changes the order of destruction operations, the changes shouldn't be noticeable to cgroup subsystems or userland. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 80dca87..71e77e7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -82,7 +82,7 @@ struct cgroup_subsys_state { /* ID for this css, if possible */ struct css_id __rcu *id; - /* Used to put @cgroup->dentry on the last css_put() */ + /* percpu_ref killing and putting dentry on the last css_put() */ struct work_struct destroy_work; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 174f4c3..3c4c4b0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4355,6 +4355,7 @@ static void offline_css(struct cgroup_subsys_state *css) ss->css_offline(css); css->flags &= ~CSS_ONLINE; + css->cgroup->nr_css--; } /* @@ -4559,14 +4560,29 @@ static void css_killed_work_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); /* + * css_tryget() is guaranteed to fail now. Tell subsystems to + * initate destruction. + */ + offline_css(css); + + /* * If @cgrp is marked dead, it's waiting for refs of all css's to * be disabled before proceeding to the second phase of cgroup * destruction. If we are the last one, kick it off. */ - if (!--cgrp->nr_css && cgroup_is_dead(cgrp)) + if (!cgrp->nr_css && cgroup_is_dead(cgrp)) cgroup_destroy_css_killed(cgrp); mutex_unlock(&cgroup_mutex); + + /* + * Put the css refs from kill_css(). Each css holds an extra + * reference to the cgroup's dentry and cgroup removal proceeds + * regardless of css refs. On the last put of each css, whenever + * that may be, the extra dentry ref is put so that dentry + * destruction happens only after all css's are released. + */ + css_put(css); } /* css kill confirmation processing requires process context, bounce */ @@ -4633,11 +4649,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * as killed on all CPUs on return. * * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. The - * notification callback keeps track of the number of css's to be - * killed and invokes cgroup_destroy_css_killed() to perform the - * rest of destruction once the percpu refs of all css's are - * confirmed to be killed. + * css is confirmed to be seen as killed on all CPUs. + * cgroup_destroy_css_killed() will be invoked to perform the rest + * of destruction once the percpu refs of all css's are confirmed + * to be killed. */ for_each_root_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); @@ -4704,36 +4719,17 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * @work: cgroup->destroy_free_work * * This function is invoked from a work item for a cgroup which is being - * destroyed after the percpu refcnts of all css's are guaranteed to be - * seen as killed on all CPUs, and performs the rest of destruction. This - * is the second step of destruction described in the comment above - * cgroup_destroy_locked(). + * destroyed after all css's are offlined and performs the rest of + * destruction. This is the second step of destruction described in the + * comment above cgroup_destroy_locked(). */ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { struct cgroup *parent = cgrp->parent; struct dentry *d = cgrp->dentry; - struct cgroup_subsys *ss; lockdep_assert_held(&cgroup_mutex); - /* - * css_tryget() is guaranteed to fail now. Tell subsystems to - * initate destruction. - */ - for_each_root_subsys(cgrp->root, ss) - offline_css(cgroup_css(cgrp, ss->subsys_id)); - - /* - * Put the css refs from cgroup_destroy_locked(). Each css holds - * an extra reference to the cgroup's dentry and cgroup removal - * proceeds regardless of css refs. On the last put of each css, - * whenever that may be, the extra dentry ref is put so that dentry - * destruction happens only after all css's are released. - */ - for_each_root_subsys(cgrp->root, ss) - css_put(cgroup_css(cgrp, ss->subsys_id)); - /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); -- cgit v0.10.2 From edae0c3358947f8be5ca99f762d89e0c38e1f5d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:51 -0400 Subject: cgroup: factor out kill_css() Factor out css ref killing from cgroup_destroy_locked() into kill_css(). We're gonna add more to the path and the factored out function will eventually be called from other places too. While at it, replace open coded percpu_ref_get() with css_get() for consistency. This shouldn't cause any functional difference as the function is not used for root cgroups. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3c4c4b0..7b7575f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4596,6 +4596,36 @@ static void css_killed_ref_fn(struct percpu_ref *ref) } /** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by putting its base + * reference. ->css_offline() will be invoked asynchronously once + * css_tryget() is guaranteed to fail and when the reference count reaches + * zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) +{ + /* + * Killing would put the base ref, but we need to keep it alive + * until after ->css_offline(). + */ + css_get(css); + + /* + * cgroup core guarantees that, by the time ->css_offline() is + * invoked, no new css reference will be given out via + * css_tryget(). We can't simply call percpu_ref_kill() and + * proceed to offlining css's because percpu_ref_kill() doesn't + * guarantee that the ref is seen as killed on all CPUs on return. + * + * Use percpu_ref_kill_and_confirm() to get notifications as each + * css is confirmed to be seen as killed on all CPUs. + */ + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); +} + +/** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed * @@ -4641,30 +4671,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Block new css_tryget() by killing css refcnts. cgroup core - * guarantees that, by the time ->css_offline() is invoked, no new - * css reference will be given out via css_tryget(). We can't - * simply call percpu_ref_kill() and proceed to offlining css's - * because percpu_ref_kill() doesn't guarantee that the ref is seen - * as killed on all CPUs on return. - * - * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. - * cgroup_destroy_css_killed() will be invoked to perform the rest - * of destruction once the percpu refs of all css's are confirmed - * to be killed. + * Initiate massacre of all css's. cgroup_destroy_css_killed() + * will be invoked to perform the rest of destruction once the + * percpu refs of all css's are confirmed to be killed. */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); - - /* - * Killing would put the base ref, but we need to keep it - * alive until after ->css_offline. - */ - percpu_ref_get(&css->refcnt); - - percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); - } + for_each_root_subsys(cgrp->root, ss) + kill_css(cgroup_css(cgrp, ss->subsys_id)); /* * Mark @cgrp dead. This prevents further task migration and child -- cgit v0.10.2 From 3c14f8b44fafaa60519440bea1591e495b928327 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:51 -0400 Subject: cgroup: move subsys file removal to kill_css() With the planned unified hierarchy, individual css's will be created and destroyed dynamically across the lifetime of a cgroup. To enable such usages, css destruction is being decoupled from cgroup destruction. This patch moves subsys file removal from cgroup_destroy_locked() to kill_css(). While this changes the order of destruction operations, the changes shouldn't be noticeable to cgroup subsystems or userland. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b7575f..3137e38 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4599,13 +4599,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref) * kill_css - destroy a css * @css: css to destroy * - * This function initiates destruction of @css by putting its base - * reference. ->css_offline() will be invoked asynchronously once - * css_tryget() is guaranteed to fail and when the reference count reaches - * zero, @css will be released. + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { + cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); + /* * Killing would put the base ref, but we need to keep it alive * until after ->css_offline(). @@ -4703,10 +4705,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup_destroy_css_killed(cgrp); /* - * Clear and remove @cgrp directory. The removal puts the base ref - * but we aren't quite done with @cgrp yet, so hold onto it. + * Clear the base files and remove @cgrp directory. The removal + * puts the base ref but we aren't quite done with @cgrp yet, so + * hold onto it. */ - cgroup_clear_dir(cgrp, cgrp->root->subsys_mask); cgroup_addrm_files(cgrp, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); -- cgit v0.10.2 From 0c21ead136a900c36f1ab74fd7d09a306dc31324 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:51 -0400 Subject: cgroup: RCU protect each cgroup_subsys_state release With the planned unified hierarchy, individual css's will be created and destroyed dynamically across the lifetime of a cgroup. To enable such usages, css destruction is being decoupled from cgroup destruction. Most of the destruction path has been decoupled but the actual free of css still depends on cgroup free path. When all css refs are drained, css_release() kicks off css_free_work_fn() which puts the cgroup. When the cgroup refcnt reaches zero, cgroup_diput() is invoked which in turn schedules RCU free of the cgroup. After a grace period, all css's are freed along with the cgroup itself. This patch moves the RCU grace period and css freeing from cgroup release path to css release path. css_release(), instead of kicking off css_free_work_fn() directly, schedules RCU callback css_free_rcu_fn() which in turn kicks off css_free_work_fn() after a RCU grace period. css_free_work_fn() is updated to free the css directly. The five-way punting - percpu ref kill confirmation, a work item, percpu ref release, RCU grace period, and again a work item - is quite hairy but the work items are there only to provide process context and the actual sequence is kill confirm -> release -> RCU free, which isn't simple but not too crazy. This removes cgroup_css() usage after offline_css() allowing clearing cgroup->subsys[] from offline_css(), which makes it consistent with online_css() and brings it closer to proper lifetime management for individual css's. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 71e77e7..c24bd0b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -82,7 +82,8 @@ struct cgroup_subsys_state { /* ID for this css, if possible */ struct css_id __rcu *id; - /* percpu_ref killing and putting dentry on the last css_put() */ + /* percpu_ref killing and RCU release */ + struct rcu_head rcu_head; struct work_struct destroy_work; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3137e38..66d0107 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -869,18 +869,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - struct cgroup_subsys *ss; mutex_lock(&cgroup_mutex); - /* - * Release the subsystem state objects. - */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); - - ss->css_free(css); - } - cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -4281,32 +4271,62 @@ err: return ret; } +/* + * css destruction is four-stage process. + * + * 1. Destruction starts. Killing of the percpu_ref is initiated. + * Implemented in kill_css(). + * + * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs + * and thus css_tryget() is guaranteed to fail, the css can be offlined + * by invoking offline_css(). After offlining, the base ref is put. + * Implemented in css_killed_work_fn(). + * + * 3. When the percpu_ref reaches zero, the only possible remaining + * accessors are inside RCU read sections. css_release() schedules the + * RCU callback. + * + * 4. After the grace period, the css can be freed. Implemented in + * css_free_work_fn(). + * + * It is actually hairier because both step 2 and 4 require process context + * and thus involve punting to css->destroy_work adding two additional + * steps to the already complex sequence. + */ static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup *cgrp = css->cgroup; if (css->parent) css_put(css->parent); - cgroup_dput(css->cgroup); + css->ss->css_free(css); + cgroup_dput(cgrp); } -static void css_release(struct percpu_ref *ref) +static void css_free_rcu_fn(struct rcu_head *rcu_head) { struct cgroup_subsys_state *css = - container_of(ref, struct cgroup_subsys_state, refcnt); + container_of(rcu_head, struct cgroup_subsys_state, rcu_head); /* * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context, which css_put() may - * be called without. @css->destroy_work will be used to invoke - * dput() asynchronously from css_put(). + * css_put(). dput() requires process context which we don't have. */ INIT_WORK(&css->destroy_work, css_free_work_fn); schedule_work(&css->destroy_work); } +static void css_release(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + call_rcu(&css->rcu_head, css_free_rcu_fn); +} + static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { @@ -4356,6 +4376,7 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; + RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); } /* -- cgit v0.10.2 From ff58ac0d58d51bffe868b239ed8fce7c4a23c5a9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 13 Aug 2013 09:17:33 +0800 Subject: cpuset: remove an unncessary forward declaration Signed-off-by: Li Zefan Signed-off-by: Tejun Heo diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 72a0383..95f4b25 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -68,9 +68,6 @@ */ int number_of_cpusets __read_mostly; -/* Forward declare cgroup structures */ -struct cgroup_subsys cpuset_subsys; - /* See "Frequency meter" comments, below. */ struct fmeter { -- cgit v0.10.2 From 930913a31289202d232186b82854b26d7fb7cf4d Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 16 Aug 2013 17:57:14 +0800 Subject: cgroup: use css_get() in cgroup_create() to check CSS_ROOT It seems that the root css doesn't have refcnt allocated(not needed?), and would cause the booting error attached. This patch tries to use css_get() to not increase the refcnt if parent is root. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] cgroup_mkdir+0x37c/0x740 PGD 0 Oops: 0002 [#1] Modules linked in: CPU: 0 PID: 1 Comm: systemd Not tainted 3.11.0-rc5-next-20130815+ #1 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 task: ffff88007f868000 ti: ffff88007f864000 task.ti: ffff88007f864000 RIP: 0010:[] [] cgroup_mkdir+0x37c/0x740 RSP: 0018:ffff88007f865df8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffffff81a46ee0 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff81a415c0 RBP: ffff88007f865ec8 R08: 0000000000000001 R09: 0000000000000000 R10: ffff88007ce6d060 R11: 0000000000000000 R12: ffff88007ce6d000 R13: ffff88007ce6d060 R14: ffffffff81a46d80 R15: ffff88007c6e8018 FS: 00007f13dbf6f840(0000) GS:ffffffff81a23000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007b7e5000 CR4: 00000000000006b0 Stack: ffffffff810b380d 0000000000000002 ffff88007f865e18 ffffffff81167069 ffff88007f865ed8 ffffffff8116a3f5 ffff880037454400 ffff88007c6e8018 ffff88007c6e8028 ffff88007c6e8328 ffff88007c6e8000 ffff88007ce6d000 Call Trace: [] ? cgroup_mkdir+0x3bd/0x740 [] ? lookup_hash+0x19/0x20 [] ? kern_path_create+0x95/0x170 [] vfs_mkdir+0x9e/0xf0 [] SyS_mkdirat+0x60/0xe0 [] SyS_mkdir+0x19/0x20 [] tracesys+0xcf/0xd4 Code: ad 70 ff ff ff 48 89 9d 60 ff ff ff 4d 89 d5 4c 8b bd 68 ff ff ff 4c 8b 65 88 eb 50 0f 1f 00 48 8b 43 18 a8 03 0f 85 6c 03 00 00 00 e8 1d 0a fb ff 85 c0 74 0d 80 3d f0 45 a1 00 00 0f 84 4c RIP [] cgroup_mkdir+0x37c/0x740 RSP CR2: 0000000000000000 ---[ end trace a4b14b49bc46fd60 ]--- Signed-off-by: Li Zhong Acked-by: Li Zefan Signed-off-by: Tejun Heo diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66d0107..b69b572 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4494,7 +4494,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; dget(dentry); - percpu_ref_get(&css->parent->refcnt); + css_get(css->parent); } /* hold a ref to the parent's dentry */ -- cgit v0.10.2 From 1cb650b91ba582f6737457b7d22e368585596d2c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 19 Aug 2013 10:05:24 +0800 Subject: cgroup: change cgroup_from_id() to css_from_id() Now we want cgroup core to always provide the css to use to the subsystems, so change this API to css_from_id(). Uninline css_from_id(), because it's getting bigger and cgroup_css() has been unexported. While at it, remove the #ifdef, and shuffle the order of the args. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c24bd0b..b685955 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -741,27 +741,11 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, return task_css(task, subsys_id)->cgroup; } -/** - * cgroup_from_id - lookup cgroup by id - * @ss: cgroup subsys to be looked into - * @id: the cgroup id - * - * Returns the cgroup if there's valid one with @id, otherwise returns NULL. - * Should be called under rcu_read_lock(). - */ -static inline struct cgroup *cgroup_from_id(struct cgroup_subsys *ss, int id) -{ -#ifdef CONFIG_PROVE_RCU - rcu_lockdep_assert(rcu_read_lock_held() || - lockdep_is_held(&cgroup_mutex), - "cgroup_from_id() needs proper protection"); -#endif - return idr_find(&ss->root->cgroup_idr, id); -} - struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); + /** * css_for_each_child - iterate through children of a css * @pos: the css * to use as the loop cursor diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b69b572..ff7d642 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5717,6 +5717,28 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) return css ? css : ERR_PTR(-ENOENT); } +/** + * css_from_id - lookup css by id + * @id: the cgroup id + * @ss: cgroup subsys to be looked into + * + * Returns the css if there's valid one with @id, otherwise returns NULL. + * Should be called under rcu_read_lock(). + */ +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) +{ + struct cgroup *cgrp; + + rcu_lockdep_assert(rcu_read_lock_held() || + lockdep_is_held(&cgroup_mutex), + "css_from_id() needs proper protection"); + + cgrp = idr_find(&ss->root->cgroup_idr, id); + if (cgrp) + return cgroup_css(cgrp, ss->subsys_id); + return NULL; +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) -- cgit v0.10.2 From 0bfb4aa67cef4982adc70590a31624d7b35a0bda Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Aug 2013 11:42:36 -0400 Subject: cgroup: fix subsystem file accesses on the root cgroup 105347ba5 ("cgroup: make cgroup_file_open() rcu_read_lock() around cgroup_css() and add cfent->css") added cfent->css to cache the associted cgroup_subsys_state across file operations. A cfent is associated with single css throughout its lifetime and the origimal commit initialized the cache pointer during cgroup_add_file() and verified that it matches the actual one in cgroup_file_open(). While this works fine for !root cgroups, it's broken for root cgroups as files in a root cgroup are created before the css's are associated with the cgroup and thus cgroup_css() call in cgroup_add_file() returns NULL associating all cfents in the root cgroup with NULL css. This makes cgroup_file_open() trigger WARN and fail with -ENODEV for all !core subsystem files in the root cgroups. There's no reason to initialize cfent->css separately from cgroup_add_file(). As the association never changes, cgroup_file_open() can set it unconditionally every time and containing the logic in cgroup_file_open() makes more sense anyway as the only reason it's necessary is file->private_data being already occupied. Fix it by setting cfent->css unconditionally from cgroup_file_open(). Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ff7d642..896e035 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2490,10 +2490,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file) } rcu_read_unlock(); - /* css should match @cfe->css, see cgroup_add_file() for details */ - if (!css || WARN_ON_ONCE(css != cfe->css)) + if (!css) return -ENODEV; + /* + * @cfe->css is used by read/write/close to determine the + * associated css. @file->private_data would be a better place but + * that's already used by seqfile. Multiple accessors may use it + * simultaneously which is okay as the association never changes. + */ + WARN_ON_ONCE(cfe->css && cfe->css != css); + cfe->css = css; + if (cft->read_map || cft->read_seq_string) { file->f_op = &cgroup_seqfile_operations; err = single_open(file, cgroup_seqfile_show, cfe); @@ -2772,18 +2780,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) dentry->d_fsdata = cfe; simple_xattrs_init(&cfe->xattrs); - /* - * cfe->css is used by read/write/close to determine the associated - * css. file->private_data would be a better place but that's - * already used by seqfile. Note that open will use the usual - * cgroup_css() and css_tryget() to acquire the css and this - * caching doesn't affect css lifetime management. - */ - if (cft->ss) - cfe->css = cgroup_css(cgrp, cft->ss->subsys_id); - else - cfe->css = &cgrp->dummy_css; - mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { -- cgit v0.10.2 From 6e6eab0efdf48fb2d8d7aee904d7740acb4661c6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Aug 2013 11:43:15 -0400 Subject: cgroup: fix cgroup_write_event_control() 81eeaf0411 ("cgroup: make cftype->[un]register_event() deal with cgroup_subsys_state inst ead of cgroup") updated the cftype event methods to take @css (cgroup_subsys_state) instead of @cgroup; however, it incorrectly used @css passed to cgroup_write_event_control(), which the dummy_css for the cgroup as the file is a cgroup core file. This leads to oops on event registration. Fix it by using the css matching the event target file. Note that cgroup_write_event_control() now disallows cgroup core files from being event sources. This is for simplicity and doesn't matter as cgroup_event will be moved and made specific to memcg. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 896e035..ef43e3f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4040,10 +4040,10 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format ' '. * Interpretation of args is defined by control file implementation. */ -static int cgroup_write_event_control(struct cgroup_subsys_state *css, +static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, struct cftype *cft, const char *buffer) { - struct cgroup *cgrp = css->cgroup; + struct cgroup *cgrp = dummy_css->cgroup; struct cgroup_event *event; struct cgroup *cgrp_cfile; unsigned int efd, cfd; @@ -4065,7 +4065,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; - event->css = css; + INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -4101,6 +4101,23 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } + if (!event->cft->ss) { + ret = -EBADF; + goto out_put_cfile; + } + + /* determine the css of @cfile and associate @event with it */ + rcu_read_lock(); + + ret = -EINVAL; + event->css = cgroup_css(cgrp, event->cft->ss->subsys_id); + if (event->css) + ret = 0; + + rcu_read_unlock(); + if (ret) + goto out_put_cfile; + /* * The file to be monitored must be in the same cgroup as * cgroup.event_control is. @@ -4116,7 +4133,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } - ret = event->cft->register_event(css, event->cft, + ret = event->cft->register_event(event->css, event->cft, event->eventfd, buffer); if (ret) goto out_put_cfile; -- cgit v0.10.2 From 35cf083619da5677f83e9a8eae813f0b413d7082 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: rename cgroup_css_from_dir() to css_from_dir() and update its syntax cgroup_css_from_dir() will grow another user. In preparation, make the following changes. * All css functions are prefixed with just "css_", rename it to css_from_dir(). * Take dentry * instead of file * as dentry is what ultimately identifies a cgroup and file may not always be available. Note that the function now checkes whether @dentry->d_inode is NULL as the caller now may specify a negative dentry. * Make it take cgroup_subsys * instead of integer subsys_id. This simplifies the function and allows specifying no subsystem for cgroup->dummy_css. * Make return section a bit less verbose. This patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b685955..21ba298 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -903,7 +903,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg, /* Get id and depth of css */ unsigned short css_id(struct cgroup_subsys_state *css); -struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); +struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss); #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ef43e3f..921b138 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5700,34 +5700,28 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) EXPORT_SYMBOL_GPL(css_lookup); /** - * cgroup_css_from_dir - get corresponding css from file open on cgroup dir - * @f: directory file of interest - * @id: subsystem id of interest + * css_from_dir - get corresponding css from the dentry of a cgroup dir + * @dentry: directory dentry of interest + * @ss: subsystem of interest * * Must be called under RCU read lock. The caller is responsible for * pinning the returned css if it needs to be accessed outside the RCU * critical section. */ -struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct cgroup *cgrp; - struct inode *inode; - struct cgroup_subsys_state *css; WARN_ON_ONCE(!rcu_read_lock_held()); - inode = file_inode(f); - /* check in cgroup filesystem dir */ - if (inode->i_op != &cgroup_dir_inode_operations) + /* is @dentry a cgroup dir? */ + if (!dentry->d_inode || + dentry->d_inode->i_op != &cgroup_dir_inode_operations) return ERR_PTR(-EBADF); - if (id < 0 || id >= CGROUP_SUBSYS_COUNT) - return ERR_PTR(-EINVAL); - - /* get cgroup */ - cgrp = __d_cgrp(f->f_dentry); - css = cgroup_css(cgrp, id); - return css ? css : ERR_PTR(-ENOENT); + cgrp = __d_cgrp(dentry); + return cgroup_css(cgrp, ss->subsys_id) ?: ERR_PTR(-ENOENT); } /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 23261f9..b59ab66 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -593,7 +593,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, rcu_read_lock(); - css = cgroup_css_from_dir(f.file, perf_subsys_id); + css = css_from_dir(f.file->f_dentry, &perf_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; -- cgit v0.10.2 From ca8bdcaff0d77990fb69e0f946018c96a70851cc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: make cgroup_css() take cgroup_subsys * instead and allow NULL subsys cgroup_css() is no longer used in hot paths. Make it take struct cgroup_subsys * and allow the users to specify NULL subsys to obtain the dummy_css. This removes open-coded NULL subsystem testing in a couple users and generally simplifies the code. After this patch, css_from_dir() also allows NULL @ss and returns the matching dummy_css. This behavior change doesn't affect its only user - perf. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 921b138..7516668 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -226,19 +226,22 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest - * @subsys_id: the subsystem of interest + * @ss: the subsystem of interest (%NULL returns the dummy_css) * - * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. - * This function must be called either under cgroup_mutex or - * rcu_read_lock() and the caller is responsible for pinning the returned - * css if it wants to keep accessing it outside the said locks. This - * function may return %NULL if @cgrp doesn't have @subsys_id enabled. + * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This + * function must be called either under cgroup_mutex or rcu_read_lock() and + * the caller is responsible for pinning the returned css if it wants to + * keep accessing it outside the said locks. This function may return + * %NULL if @cgrp doesn't have @subsys_id enabled. */ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - int subsys_id) + struct cgroup_subsys *ss) { - return rcu_dereference_check(cgrp->subsys[subsys_id], - lockdep_is_held(&cgroup_mutex)); + if (ss) + return rcu_dereference_check(cgrp->subsys[ss->subsys_id], + lockdep_is_held(&cgroup_mutex)); + else + return &cgrp->dummy_css; } /* convenient tests for these bits */ @@ -580,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ - template[i] = cgroup_css(cgrp, i); + template[i] = cgroup_css(cgrp, ss); } else { /* Subsystem is not in this hierarchy, so we * don't want to change the subsystem state */ @@ -1062,30 +1065,30 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgroup_css(cgrp, i)); - BUG_ON(!cgroup_css(cgroup_dummy_top, i)); - BUG_ON(cgroup_css(cgroup_dummy_top, i)->cgroup != cgroup_dummy_top); + BUG_ON(cgroup_css(cgrp, ss)); + BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); + BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); rcu_assign_pointer(cgrp->subsys[i], - cgroup_css(cgroup_dummy_top, i)); - cgroup_css(cgrp, i)->cgroup = cgrp; + cgroup_css(cgroup_dummy_top, ss)); + cgroup_css(cgrp, ss)->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgroup_css(cgrp, i)); + ss->bind(cgroup_css(cgrp, ss)); /* refcount was already taken, and we're keeping it */ root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ - BUG_ON(cgroup_css(cgrp, i) != cgroup_css(cgroup_dummy_top, i)); - BUG_ON(cgroup_css(cgrp, i)->cgroup != cgrp); + BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); + BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_css(cgroup_dummy_top, i)); + ss->bind(cgroup_css(cgroup_dummy_top, ss)); - cgroup_css(cgroup_dummy_top, i)->cgroup = cgroup_dummy_top; + cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; @@ -1930,7 +1933,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_next); struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, int subsys_id) { - return cgroup_css(tset->cur_cgrp, subsys_id); + return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); } EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); @@ -2071,7 +2074,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); if (ss->can_attach) { retval = ss->can_attach(css, &tset); @@ -2113,7 +2116,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); if (ss->attach) ss->attach(css, &tset); @@ -2135,7 +2138,7 @@ out_put_css_set_refs: out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); if (ss == failed_ss) break; @@ -2481,13 +2484,9 @@ static int cgroup_file_open(struct inode *inode, struct file *file) * @css stays alive for all file operations. */ rcu_read_lock(); - if (cft->ss) { - css = cgroup_css(cgrp, cft->ss->subsys_id); - if (!css_tryget(css)) - css = NULL; - } else { - css = &cgrp->dummy_css; - } + css = cgroup_css(cgrp, cft->ss); + if (cft->ss && !css_tryget(css)) + css = NULL; rcu_read_unlock(); if (!css) @@ -2878,7 +2877,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) /* add/rm files for all cgroups created before */ rcu_read_lock(); - css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) { + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (cgroup_is_dead(cgrp)) @@ -3082,10 +3081,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, if (&next->sibling == &cgrp->children) return NULL; - if (parent_css->ss) - return cgroup_css(next, parent_css->ss->subsys_id); - else - return &next->dummy_css; + return cgroup_css(next, parent_css->ss); } EXPORT_SYMBOL_GPL(css_next_child); @@ -4110,7 +4106,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, rcu_read_lock(); ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss->subsys_id); + event->css = cgroup_css(cgrp, event->cft->ss); if (event->css) ret = 0; @@ -4266,7 +4262,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* This cgroup is ready now */ for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); struct css_id *id = rcu_dereference_protected(css->id, true); /* @@ -4349,11 +4345,11 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->id = NULL; if (cgrp->parent) - css->parent = cgroup_css(cgrp->parent, ss->subsys_id); + css->parent = cgroup_css(cgrp->parent, ss); else css->flags |= CSS_ROOT; - BUG_ON(cgroup_css(cgrp, ss->subsys_id)); + BUG_ON(cgroup_css(cgrp, ss)); } /* invoke ->css_online() on a new CSS and mark it online if successful */ @@ -4466,7 +4462,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(cgroup_css(parent, ss->subsys_id)); + css = ss->css_alloc(cgroup_css(parent, ss)); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4712,7 +4708,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * percpu refs of all css's are confirmed to be killed. */ for_each_root_subsys(cgrp->root, ss) - kill_css(cgroup_css(cgrp, ss->subsys_id)); + kill_css(cgroup_css(cgrp, ss)); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4839,7 +4835,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_css(css, ss, cgroup_dummy_top); @@ -4918,7 +4914,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -5000,7 +4996,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); - offline_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + offline_css(cgroup_css(cgroup_dummy_top, ss)); if (ss->use_id) idr_destroy(&ss->idr); @@ -5034,7 +5030,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. */ - ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + ss->css_free(cgroup_css(cgroup_dummy_top, ss)); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); mutex_unlock(&cgroup_mutex); @@ -5721,7 +5717,7 @@ struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, return ERR_PTR(-EBADF); cgrp = __d_cgrp(dentry); - return cgroup_css(cgrp, ss->subsys_id) ?: ERR_PTR(-ENOENT); + return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); } /** -- cgit v0.10.2 From 9fa4db334c7d9570aec7a5121e84fae99aae1d04 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: implement CFTYPE_NO_PREFIX When cgroup files are created, cgroup core automatically prepends the name of the subsystem as prefix. This patch adds CFTYPE_NO_ which disables the automatic prefix. This is to work around historical baggages and shouldn't be used for new files. This will be used to move "cgroup.event_control" from cgroup core to memcg. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Cc: Glauber Costa diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 21ba298..3561d30 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -411,6 +411,7 @@ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ + CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ }; #define MAX_CFTYPE_NAME 64 diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7516668..a41dc87 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2756,7 +2756,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (cft->ss && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { strcpy(name, cft->ss->name); strcat(name, "."); } -- cgit v0.10.2 From 7941cb027dccedec3c047271554ddcf4be2e0697 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: make cgroup_event hold onto cgroup_subsys_state instead of cgroup Currently, each registered cgroup_event holds an extra reference to the cgroup. This is a bit weird as events are subsystem specific and will also be incorrect in the planned unified hierarchy as css (cgroup_subsys_state) may come and go dynamically across the lifetime of a cgroup. Holding onto cgroup won't prevent the target css from going away. Update cgroup_event to hold onto the css the traget file belongs to instead of cgroup. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a41dc87..12237a2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3969,7 +3969,6 @@ static void cgroup_event_remove(struct work_struct *work) struct cgroup_event *event = container_of(work, struct cgroup_event, remove); struct cgroup_subsys_state *css = event->css; - struct cgroup *cgrp = css->cgroup; remove_wait_queue(event->wqh, &event->wait); @@ -3980,7 +3979,7 @@ static void cgroup_event_remove(struct work_struct *work) eventfd_ctx_put(event->eventfd); kfree(event); - cgroup_dput(cgrp); + css_put(css); } /* @@ -4103,12 +4102,16 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, goto out_put_cfile; } - /* determine the css of @cfile and associate @event with it */ + /* + * Determine the css of @cfile and associate @event with it. + * Remaining events are automatically removed on cgroup destruction + * but the removal is asynchronous, so take an extra ref. + */ rcu_read_lock(); ret = -EINVAL; event->css = cgroup_css(cgrp, event->cft->ss); - if (event->css) + if (event->css && css_tryget(event->css)) ret = 0; rcu_read_unlock(); @@ -4122,28 +4125,21 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); if (cgrp_cfile != cgrp) { ret = -EINVAL; - goto out_put_cfile; + goto out_put_css; } if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; - goto out_put_cfile; + goto out_put_css; } ret = event->cft->register_event(event->css, event->cft, event->eventfd, buffer); if (ret) - goto out_put_cfile; + goto out_put_css; efile->f_op->poll(efile, &event->pt); - /* - * Events should be removed after rmdir of cgroup directory, but before - * destroying subsystem state objects. Let's take reference to cgroup - * directory dentry to do that. - */ - dget(cgrp->dentry); - spin_lock(&cgrp->event_list_lock); list_add(&event->list, &cgrp->event_list); spin_unlock(&cgrp->event_list_lock); @@ -4153,6 +4149,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, return 0; +out_put_css: + css_put(event->css); out_put_cfile: fput(cfile); out_put_eventfd: -- cgit v0.10.2 From 7c918cbbd829669bf70ffcc45962d5d992942243 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: make cgroup_write_event_control() use css_from_dir() instead of __d_cgrp() cgroup_event will be moved to its only user - memcg. Replace __d_cgrp() usage with css_from_dir(), which is already exported. This also simplifies the code a bit. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 12237a2..e76698d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4041,7 +4041,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, { struct cgroup *cgrp = dummy_css->cgroup; struct cgroup_event *event; - struct cgroup *cgrp_cfile; + struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; struct file *efile; struct file *cfile; @@ -4103,7 +4103,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, } /* - * Determine the css of @cfile and associate @event with it. + * Determine the css of @cfile, verify it belongs to the same + * cgroup as cgroup.event_control, and associate @event with it. * Remaining events are automatically removed on cgroup destruction * but the removal is asynchronous, so take an extra ref. */ @@ -4111,23 +4112,14 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, ret = -EINVAL; event->css = cgroup_css(cgrp, event->cft->ss); - if (event->css && css_tryget(event->css)) + cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss); + if (event->css && event->css == cfile_css && css_tryget(event->css)) ret = 0; rcu_read_unlock(); if (ret) goto out_put_cfile; - /* - * The file to be monitored must be in the same cgroup as - * cgroup.event_control is. - */ - cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); - if (cgrp_cfile != cgrp) { - ret = -EINVAL; - goto out_put_css; - } - if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; goto out_put_css; -- cgit v0.10.2 From d1625964da51bda61306ad3ec45307a799c21f08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Aug 2013 14:27:23 -0400 Subject: cgroup: fix cgroup_css() invocation in css_from_id() ca8bdcaff0 ("cgroup: make cgroup_css() take cgroup_subsys * instead and allow NULL subsys") missed one conversion in css_from_id(), which was newly added. As css_from_id() doesn't have any user yet, this doesn't break anything other than generating a build warning. Convert it. Signed-off-by: Tejun Heo Reported-by: Stephen Rothwell Reported-by: kbuild test robot diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e76698d..b5f4989 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5729,7 +5729,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) - return cgroup_css(cgrp, ss->subsys_id); + return cgroup_css(cgrp, ss); return NULL; } -- cgit v0.10.2