From 79bd9814e5ec9a288d6599f53aeac0b548fdfe52 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:42 -0500 Subject: cgroup, memcg: move cgroup_event implementation to memcg cgroup_event is way over-designed and tries to build a generic flexible event mechanism into cgroup - fully customizable event specification for each user of the interface. This is utterly unnecessary and overboard, especially in light of the planned unified hierarchy, as there's gonna be a single agent. Simply generating events at fixed points, or if that's too restrictive, a configurable cadence or a single set of configurable points should be enough. Thankfully, memcg is the only user and gets to keep it. Replacing it with something simpler on sane_behavior is strongly recommended. This patch moves cgroup_event and "cgroup.event_control" implementation to mm/memcontrol.c. Clearing of events on cgroup destruction is moved from cgroup_destroy_locked() to mem_cgroup_css_offline(), which shouldn't make any noticeable difference. cgroup_css() and __file_cft() are exported to enable the move; however, this will soon be reverted once the event code is updated to be memcg specific. Note that "cgroup.event_control" will now exist only on the hierarchy with memcg attached to it. While this change is visible to userland, it is unlikely to be noticeable as the file has never been meaningful outside memcg. Aside from the above change, this is pure code relocation. v2: Per Li Zefan's comments, init/Kconfig updated accordingly and poll.h inclusion moved from cgroup.c to memcontrol.c. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3561d30..40c2427 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -907,6 +907,11 @@ unsigned short css_id(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); +/* XXX: temporary */ +struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss); +struct cftype *__file_cft(struct file *file); + #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } diff --git a/init/Kconfig b/init/Kconfig index 3ecd8a1..3ca5b81 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -861,7 +861,6 @@ config NUMA_BALANCING menuconfig CGROUPS boolean "Control Group support" - depends on EVENTFD help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory @@ -928,6 +927,7 @@ config MEMCG bool "Memory Resource Controller for Control Groups" depends on RESOURCE_COUNTERS select MM_OWNER + select EVENTFD help Provides a memory resource controller that manages both anonymous memory and page cache.
(See Documentation/cgroups/memory.txt) @@ -1167,7 +1167,6 @@ config UIDGID_STRICT_TYPE_CHECKS config SCHED_AUTOGROUP bool "Automatic process group scheduling" - select EVENTFD select CGROUPS select CGROUP_SCHED select FAIR_GROUP_SCHED diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8bd9cfd..4bccaa7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -56,11 +56,8 @@ #include #include #include /* TODO: replace with more sophisticated array */ -#include -#include #include /* used in cgroup_attach_task */ #include -#include #include @@ -156,36 +153,6 @@ struct css_id { unsigned short stack[0]; /* Array of Length (depth+1) */ }; -/* - * cgroup_event represents events which userspace want to receive. - */ -struct cgroup_event { - /* - * css which the event belongs to. - */ - struct cgroup_subsys_state *css; - /* - * Control file which the event associated. - */ - struct cftype *cft; - /* - * eventfd to signal userspace about the event. - */ - struct eventfd_ctx *eventfd; - /* - * Each of these stored in a list by the cgroup. - */ - struct list_head list; - /* - * All fields below needed to unregister event when - * userspace closes eventfd. - */ - poll_table pt; - wait_queue_head_t *wqh; - wait_queue_t wait; - struct work_struct remove; -}; - /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -235,8 +202,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled. */ -static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) { if (ss) return rcu_dereference_check(cgrp->subsys[ss->subsys_id], @@ -2663,7 +2630,7 @@ static const struct inode_operations cgroup_dir_inode_operations = { /* * Check if a file is a control file */ -static inline struct cftype *__file_cft(struct file *file) +struct cftype *__file_cft(struct file *file) { if (file_inode(file)->i_fop != &cgroup_file_operations) return ERR_PTR(-EINVAL); @@ -3949,202 +3916,6 @@ static void cgroup_dput(struct cgroup *cgrp) deactivate_super(sb); } -/* - * Unregister event and free resources. - * - * Gets called from workqueue. - */ -static void cgroup_event_remove(struct work_struct *work) -{ - struct cgroup_event *event = container_of(work, struct cgroup_event, - remove); - struct cgroup_subsys_state *css = event->css; - - remove_wait_queue(event->wqh, &event->wait); - - event->cft->unregister_event(css, event->cft, event->eventfd); - - /* Notify userspace the event is going away. */ - eventfd_signal(event->eventfd, 1); - - eventfd_ctx_put(event->eventfd); - kfree(event); - css_put(css); -} - -/* - * Gets called on POLLHUP on eventfd when user closes it. - * - * Called with wqh->lock held and interrupts disabled. - */ -static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) -{ - struct cgroup_event *event = container_of(wait, - struct cgroup_event, wait); - struct cgroup *cgrp = event->css->cgroup; - unsigned long flags = (unsigned long)key; - - if (flags & POLLHUP) { - /* - * If the event has been detached at cgroup removal, we - * can simply return knowing the other side will cleanup - * for us. - * - * We can't race against event freeing since the other - * side will require wqh->lock via remove_wait_queue(), - * which we hold. 
- */ - spin_lock(&cgrp->event_list_lock); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - /* - * We are in atomic context, but cgroup_event_remove() - * may sleep, so we have to call it in workqueue. - */ - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - } - - return 0; -} - -static void cgroup_event_ptable_queue_proc(struct file *file, - wait_queue_head_t *wqh, poll_table *pt) -{ - struct cgroup_event *event = container_of(pt, - struct cgroup_event, pt); - - event->wqh = wqh; - add_wait_queue(wqh, &event->wait); -} - -/* - * Parse input and register new cgroup event handler. - * - * Input must be in format ' '. - * Interpretation of args is defined by control file implementation. - */ -static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, const char *buffer) -{ - struct cgroup *cgrp = dummy_css->cgroup; - struct cgroup_event *event; - struct cgroup_subsys_state *cfile_css; - unsigned int efd, cfd; - struct fd efile; - struct fd cfile; - char *endp; - int ret; - - efd = simple_strtoul(buffer, &endp, 10); - if (*endp != ' ') - return -EINVAL; - buffer = endp + 1; - - cfd = simple_strtoul(buffer, &endp, 10); - if ((*endp != ' ') && (*endp != '\0')) - return -EINVAL; - buffer = endp + 1; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); - INIT_WORK(&event->remove, cgroup_event_remove); - - efile = fdget(efd); - if (!efile.file) { - ret = -EBADF; - goto out_kfree; - } - - event->eventfd = eventfd_ctx_fileget(efile.file); - if (IS_ERR(event->eventfd)) { - ret = PTR_ERR(event->eventfd); - goto out_put_efile; - } - - cfile = fdget(cfd); - if (!cfile.file) { - ret = -EBADF; - goto out_put_eventfd; - } - - /* the process need read permission on control file */ - /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(file_inode(cfile.file), MAY_READ); - if (ret < 0) - goto out_put_cfile; - - event->cft = __file_cft(cfile.file); - if (IS_ERR(event->cft)) { - ret = PTR_ERR(event->cft); - goto out_put_cfile; - } - - if (!event->cft->ss) { - ret = -EBADF; - goto out_put_cfile; - } - - /* - * Determine the css of @cfile, verify it belongs to the same - * cgroup as cgroup.event_control, and associate @event with it. - * Remaining events are automatically removed on cgroup destruction - * but the removal is asynchronous, so take an extra ref. 
- */ - rcu_read_lock(); - - ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); - if (event->css && event->css == cfile_css && css_tryget(event->css)) - ret = 0; - - rcu_read_unlock(); - if (ret) - goto out_put_cfile; - - if (!event->cft->register_event || !event->cft->unregister_event) { - ret = -EINVAL; - goto out_put_css; - } - - ret = event->cft->register_event(event->css, event->cft, - event->eventfd, buffer); - if (ret) - goto out_put_css; - - efile.file->f_op->poll(efile.file, &event->pt); - - spin_lock(&cgrp->event_list_lock); - list_add(&event->list, &cgrp->event_list); - spin_unlock(&cgrp->event_list_lock); - - fdput(cfile); - fdput(efile); - - return 0; - -out_put_css: - css_put(event->css); -out_put_cfile: - fdput(cfile); -out_put_eventfd: - eventfd_ctx_put(event->eventfd); -out_put_efile: - fdput(efile); -out_kfree: - kfree(event); - - return ret; -} - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -4170,11 +3941,6 @@ static struct cftype cgroup_base_files[] = { .mode = S_IRUGO | S_IWUSR, }, { - .name = "cgroup.event_control", - .write_string = cgroup_write_event_control, - .mode = S_IWUGO, - }, - { .name = "cgroup.clone_children", .flags = CFTYPE_INSANE, .read_u64 = cgroup_clone_children_read, @@ -4666,7 +4432,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; - struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; struct cgroup *child; bool empty; @@ -4741,18 +4506,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) dget(d); cgroup_d_remove_dir(d); - /* - * Unregister events and notify userspace. - * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. - */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del_init(&event->list); - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - return 0; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 13b9d0f..02dae32 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -226,6 +228,36 @@ struct mem_cgroup_eventfd_list { struct eventfd_ctx *eventfd; }; +/* + * cgroup_event represents events which userspace want to receive. + */ +struct cgroup_event { + /* + * css which the event belongs to. + */ + struct cgroup_subsys_state *css; + /* + * Control file which the event associated. + */ + struct cftype *cft; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. + */ + struct list_head list; + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; +}; + static void mem_cgroup_threshold(struct mem_cgroup *memcg); static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); @@ -5947,6 +5979,202 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) } #endif +/* + * Unregister event and free resources. + * + * Gets called from workqueue. 
+ */ +static void cgroup_event_remove(struct work_struct *work) +{ + struct cgroup_event *event = container_of(work, struct cgroup_event, + remove); + struct cgroup_subsys_state *css = event->css; + + remove_wait_queue(event->wqh, &event->wait); + + event->cft->unregister_event(css, event->cft, event->eventfd); + + /* Notify userspace the event is going away. */ + eventfd_signal(event->eventfd, 1); + + eventfd_ctx_put(event->eventfd); + kfree(event); + css_put(css); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct cgroup_event *event = container_of(wait, + struct cgroup_event, wait); + struct cgroup *cgrp = event->css->cgroup; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + /* + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will cleanup + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. + */ + spin_lock(&cgrp->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but cgroup_event_remove() + * may sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); + } + + return 0; +} + +static void cgroup_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct cgroup_event *event = container_of(pt, + struct cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * Parse input and register new cgroup event handler. + * + * Input must be in format '<event_fd> <control_fd> <args>'. + * Interpretation of args is defined by control file implementation. + */ +static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, + struct cftype *cft, const char *buffer) +{ + struct cgroup *cgrp = dummy_css->cgroup; + struct cgroup_event *event; + struct cgroup_subsys_state *cfile_css; + unsigned int efd, cfd; + struct fd efile; + struct fd cfile; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, cgroup_event_wake); + INIT_WORK(&event->remove, cgroup_event_remove); + + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out_kfree; + } + + event->eventfd = eventfd_ctx_fileget(efile.file); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto out_put_efile; + } + + cfile = fdget(cfd); + if (!cfile.file) { + ret = -EBADF; + goto out_put_eventfd; + } + + /* the process need read permission on control file */ + /* AV: shouldn't we check that it's been opened for read instead?
*/ + ret = inode_permission(file_inode(cfile.file), MAY_READ); + if (ret < 0) + goto out_put_cfile; + + event->cft = __file_cft(cfile.file); + if (IS_ERR(event->cft)) { + ret = PTR_ERR(event->cft); + goto out_put_cfile; + } + + if (!event->cft->ss) { + ret = -EBADF; + goto out_put_cfile; + } + + /* + * Determine the css of @cfile, verify it belongs to the same + * cgroup as cgroup.event_control, and associate @event with it. + * Remaining events are automatically removed on cgroup destruction + * but the removal is asynchronous, so take an extra ref. + */ + rcu_read_lock(); + + ret = -EINVAL; + event->css = cgroup_css(cgrp, event->cft->ss); + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); + if (event->css && event->css == cfile_css && css_tryget(event->css)) + ret = 0; + + rcu_read_unlock(); + if (ret) + goto out_put_cfile; + + if (!event->cft->register_event || !event->cft->unregister_event) { + ret = -EINVAL; + goto out_put_css; + } + + ret = event->cft->register_event(event->css, event->cft, + event->eventfd, buffer); + if (ret) + goto out_put_css; + + efile.file->f_op->poll(efile.file, &event->pt); + + spin_lock(&cgrp->event_list_lock); + list_add(&event->list, &cgrp->event_list); + spin_unlock(&cgrp->event_list_lock); + + fdput(cfile); + fdput(efile); + + return 0; + +out_put_css: + css_put(event->css); +out_put_cfile: + fdput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fdput(efile); +out_kfree: + kfree(event); + + return ret; +} + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -5994,6 +6222,12 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_hierarchy_read, }, { + .name = "cgroup.event_control", + .write_string = cgroup_write_event_control, + .flags = CFTYPE_NO_PREFIX, + .mode = S_IWUGO, + }, + { .name = "swappiness", .read_u64 = mem_cgroup_swappiness_read, .write_u64 = mem_cgroup_swappiness_write, @@ -6326,6 +6560,20 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cgroup *cgrp = css->cgroup; + struct cgroup_event *event, *tmp; + + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace. + */ + spin_lock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_del_init(&event->list); + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); kmem_cgroup_css_offline(memcg); -- cgit v0.10.2 From b5557c4c3b1a38074d7001b87c2482eda3a0834a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:42 -0500 Subject: memcg: cgroup_write_event_control() now knows @css is for memcg @css for cgroup_write_event_control() is now always for memcg and the target file should be a memcg file too. Drop code which assumes @css is dummy_css and the target file may belong to different subsystems. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 02dae32..d003681 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6056,10 +6056,10 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format ' '. * Interpretation of args is defined by control file implementation. 
*/ -static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, +static int cgroup_write_event_control(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct cgroup *cgrp = dummy_css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; @@ -6082,6 +6082,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, if (!event) return -ENOMEM; + event->css = css; INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -6117,23 +6118,17 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, goto out_put_cfile; } - if (!event->cft->ss) { - ret = -EBADF; - goto out_put_cfile; - } - /* - * Determine the css of @cfile, verify it belongs to the same - * cgroup as cgroup.event_control, and associate @event with it. - * Remaining events are automatically removed on cgroup destruction - * but the removal is asynchronous, so take an extra ref. + * Verify @cfile should belong to @css. Also, remaining events are + * automatically removed on cgroup destruction but the removal is + * asynchronous, so take an extra ref on @css. */ rcu_read_lock(); ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); - if (event->css && event->css == cfile_css && css_tryget(event->css)) + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, + &mem_cgroup_subsys); + if (cfile_css == css && css_tryget(css)) ret = 0; rcu_read_unlock(); @@ -6145,7 +6140,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, goto out_put_css; } - ret = event->cft->register_event(event->css, event->cft, + ret = event->cft->register_event(css, event->cft, event->eventfd, buffer); if (ret) goto out_put_css; @@ -6162,7 +6157,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, return 0; out_put_css: - css_put(event->css); + css_put(css); out_put_cfile: fdput(cfile); out_put_eventfd: -- cgit v0.10.2 From fba94807837850e211f8975e1970e23e7804ff4d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:43 -0500 Subject: cgroup, memcg: move cgroup->event_list[_lock] and event callbacks into memcg cgroup_event is being moved from cgroup core to memcg and the implementation is already moved by the previous patch. This patch moves the data fields and callbacks. * cgroup->event_list[_lock] are moved to mem_cgroup. * cftype->[un]register_event() are moved to cgroup_event. This makes it impossible for individual cftype definitions to specify their event callbacks. This is worked around by simply hard-coding filename to event callback mapping in cgroup_write_event_control(). This is awkward and inflexible, which is actually desirable given that we don't want to grow more usages of this feature. * eventfd_ctx declaration is removed from cgroup.h, which makes vmpressure.h miss eventfd_ctx declaration. Include eventfd.h from vmpressure.h. v2: Use file name from dentry instead of cftype. This will allow removing all cftype handling in the function. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 40c2427..612adc5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -29,7 +29,6 @@ struct cgroup_subsys; struct inode; struct cgroup; struct css_id; -struct eventfd_ctx; extern int cgroup_init_early(void); extern int cgroup_init(void); @@ -239,10 +238,6 @@ struct cgroup { struct rcu_head rcu_head; struct work_struct destroy_work; - /* List of events which userspace want to receive */ - struct list_head event_list; - spinlock_t event_list_lock; - /* directory xattrs */ struct simple_xattrs xattrs; }; @@ -506,25 +501,6 @@ struct cftype { int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); int (*release)(struct inode *inode, struct file *file); - - /* - * register_event() callback will be used to add new userspace - * waiter for changes related to the cftype. Implement it if - * you want to provide this functionality. Use eventfd_signal() - * on eventfd to send notification to userspace. - */ - int (*register_event)(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, - const char *args); - /* - * unregister_event() callback will be called when userspace - * closes the eventfd or on cgroup removing. - * This callback must be implemented, if you want provide - * notification functionality. - */ - void (*unregister_event)(struct cgroup_subsys_state *css, - struct cftype *cft, - struct eventfd_ctx *eventfd); }; /* diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 3f3788d..9dd1914 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -7,6 +7,7 @@ #include #include #include +#include struct vmpressure { unsigned long scanned; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4bccaa7..feda7c5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1352,8 +1352,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->event_list); - spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d003681..2fcacb1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -249,6 +249,22 @@ struct cgroup_event { */ struct list_head list; /* + * register_event() callback will be used to add new userspace + * waiter for changes related to this event. Use eventfd_signal() + * on eventfd to send notification to userspace. + */ + int (*register_event)(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd, + const char *args); + /* + * unregister_event() callback will be called when userspace closes + * the eventfd or on cgroup removing. This callback must be set, + * if you want provide notification functionality. + */ + void (*unregister_event)(struct cgroup_subsys_state *css, + struct cftype *cft, + struct eventfd_ctx *eventfd); + /* * All fields below needed to unregister event when * userspace closes eventfd. 
*/ @@ -362,6 +378,10 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif + /* List of events which userspace want to receive */ + struct list_head event_list; + spinlock_t event_list_lock; + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; @@ -5992,7 +6012,7 @@ static void cgroup_event_remove(struct work_struct *work) remove_wait_queue(event->wqh, &event->wait); - event->cft->unregister_event(css, event->cft, event->eventfd); + event->unregister_event(css, event->cft, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); @@ -6012,7 +6032,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, { struct cgroup_event *event = container_of(wait, struct cgroup_event, wait); - struct cgroup *cgrp = event->css->cgroup; + struct mem_cgroup *memcg = mem_cgroup_from_css(event->css); unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { @@ -6025,7 +6045,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, * side will require wqh->lock via remove_wait_queue(), * which we hold. */ - spin_lock(&cgrp->event_list_lock); + spin_lock(&memcg->event_list_lock); if (!list_empty(&event->list)) { list_del_init(&event->list); /* @@ -6034,7 +6054,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, */ schedule_work(&event->remove); } - spin_unlock(&cgrp->event_list_lock); + spin_unlock(&memcg->event_list_lock); } return 0; @@ -6059,12 +6079,13 @@ static void cgroup_event_ptable_queue_proc(struct file *file, static int cgroup_write_event_control(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct cgroup *cgrp = css->cgroup; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; struct fd efile; struct fd cfile; + const char *name; char *endp; int ret; @@ -6119,6 +6140,31 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, } /* + * Determine the event callbacks and set them in @event. This used + * to be done via struct cftype but cgroup core no longer knows + * about these events. The following is crude but the whole thing + * is for compatibility anyway. + */ + name = cfile.file->f_dentry->d_name.name; + + if (!strcmp(name, "memory.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else if (!strcmp(name, "memory.oom_control")) { + event->register_event = mem_cgroup_oom_register_event; + event->unregister_event = mem_cgroup_oom_unregister_event; + } else if (!strcmp(name, "memory.pressure_level")) { + event->register_event = vmpressure_register_event; + event->unregister_event = vmpressure_unregister_event; + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else { + ret = -EINVAL; + goto out_put_cfile; + } + + /* * Verify @cfile should belong to @css. Also, remaining events are * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. 
@@ -6135,21 +6181,15 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (ret) goto out_put_cfile; - if (!event->cft->register_event || !event->cft->unregister_event) { - ret = -EINVAL; - goto out_put_css; - } - - ret = event->cft->register_event(css, event->cft, - event->eventfd, buffer); + ret = event->register_event(css, event->cft, event->eventfd, buffer); if (ret) goto out_put_css; efile.file->f_op->poll(efile.file, &event->pt); - spin_lock(&cgrp->event_list_lock); - list_add(&event->list, &cgrp->event_list); - spin_unlock(&cgrp->event_list_lock); + spin_lock(&memcg->event_list_lock); + list_add(&event->list, &memcg->event_list); + spin_unlock(&memcg->event_list_lock); fdput(cfile); fdput(efile); @@ -6175,8 +6215,6 @@ static struct cftype mem_cgroup_files[] = { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "max_usage_in_bytes", @@ -6236,14 +6274,10 @@ static struct cftype mem_cgroup_files[] = { .name = "oom_control", .read_map = mem_cgroup_oom_control_read, .write_u64 = mem_cgroup_oom_control_write, - .register_event = mem_cgroup_oom_register_event, - .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, { .name = "pressure_level", - .register_event = vmpressure_register_event, - .unregister_event = vmpressure_unregister_event, }, #ifdef CONFIG_NUMA { @@ -6291,8 +6325,6 @@ static struct cftype memsw_cgroup_files[] = { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "memsw.max_usage_in_bytes", @@ -6483,6 +6515,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); + INIT_LIST_HEAD(&memcg->event_list); + spin_lock_init(&memcg->event_list_lock); return &memcg->css; @@ -6555,7 +6589,6 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cgroup *cgrp = css->cgroup; struct cgroup_event *event, *tmp; /* @@ -6563,12 +6596,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) * Notify userspace about cgroup removing only after rmdir of cgroup * directory to avoid race between userspace and kernelspace. */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + spin_lock(&memcg->event_list_lock); + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { list_del_init(&event->list); schedule_work(&event->remove); } - spin_unlock(&cgrp->event_list_lock); + spin_unlock(&memcg->event_list_lock); kmem_cgroup_css_offline(memcg); -- cgit v0.10.2 From 347c4a8747104a945ecced358944e42879176ca5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:43 -0500 Subject: memcg: remove cgroup_event->cft The only use of cgroup_event->cft is distinguishing "usage_in_bytes" and "memsw.usage_in_bytes" for mem_cgroup_usage_[un]register_event(), which can be done by adding an explicit argument to the function and implementing two wrappers so that the two cases can be distinguished from the function alone.
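The change follows the standard pattern of replacing data-driven dispatch with dedicated entry points whose identity carries the information. A compilable toy sketch of the pattern (hypothetical names, not the kernel code):

    #include <stdio.h>

    enum res_type { RES_MEM, RES_MEMSWAP };

    /* The worker takes the discriminating value as an explicit argument
     * instead of digging it out of an associated cftype. */
    static int register_usage_event(const char *args, enum res_type type)
    {
            printf("register %s threshold: %s\n",
                   type == RES_MEM ? "memory" : "memsw", args);
            return 0;
    }

    /* Thin wrappers bake the type in, so a stored function pointer alone
     * identifies which case is meant. */
    static int mem_usage_register(const char *args)
    {
            return register_usage_event(args, RES_MEM);
    }

    static int memsw_usage_register(const char *args)
    {
            return register_usage_event(args, RES_MEMSWAP);
    }

    int main(void)
    {
            int (*cb)(const char *) = memsw_usage_register; /* chosen by file name */
            return cb("1048576");
    }
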
Remove cgroup_event->cft and the related code including [un]register_events() methods. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 9dd1914..b048365 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -36,11 +36,9 @@ extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); extern int vmpressure_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args); extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd); #else static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2fcacb1..3c93dcf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -237,10 +237,6 @@ struct cgroup_event { */ struct cgroup_subsys_state *css; /* - * Control file which the event associated. - */ - struct cftype *cft; - /* * eventfd to signal userspace about the event. */ struct eventfd_ctx *eventfd; @@ -254,15 +250,13 @@ struct cgroup_event { * on eventfd to send notification to userspace. */ int (*register_event)(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, - const char *args); + struct eventfd_ctx *eventfd, const char *args); /* * unregister_event() callback will be called when userspace closes * the eventfd or on cgroup removing. This callback must be set, * if you want provide notification functionality. */ void (*unregister_event)(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd); /* * All fields below needed to unregister event when @@ -5688,13 +5682,12 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +static int __mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd, const char *args, enum res_type type) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@ -5771,13 +5764,24 @@ unlock: return ret; } -static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) +static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(css, eventfd, args, _MEM); +} + +static int memsw_cgroup_usage_register_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(css, eventfd, args, _MEMSWAP); +} + +static void __mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd, enum res_type type) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@ -5850,14 +5854,24 @@ unlock: 
mutex_unlock(&memcg->thresholds_lock); } +static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(css, eventfd, _MEM); +} + +static void memsw_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(css, eventfd, _MEMSWAP); +} + static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) + struct eventfd_ctx *eventfd, const char *args) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; - enum res_type type = MEMFILE_TYPE(cft->private); - BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; @@ -5876,13 +5890,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, } static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) + struct eventfd_ctx *eventfd) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; - enum res_type type = MEMFILE_TYPE(cft->private); - - BUG_ON(type != _OOM_TYPE); spin_lock(&memcg_oom_lock); @@ -6012,7 +6023,7 @@ static void cgroup_event_remove(struct work_struct *work) remove_wait_queue(event->wqh, &event->wait); - event->unregister_event(css, event->cft, event->eventfd); + event->unregister_event(css, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); @@ -6133,12 +6144,6 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (ret < 0) goto out_put_cfile; - event->cft = __file_cft(cfile.file); - if (IS_ERR(event->cft)) { - ret = PTR_ERR(event->cft); - goto out_put_cfile; - } - /* * Determine the event callbacks and set them in @event. This used * to be done via struct cftype but cgroup core no longer knows @@ -6157,8 +6162,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event->register_event = vmpressure_register_event; event->unregister_event = vmpressure_unregister_event; } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { - event->register_event = mem_cgroup_usage_register_event; - event->unregister_event = mem_cgroup_usage_unregister_event; + event->register_event = memsw_cgroup_usage_register_event; + event->unregister_event = memsw_cgroup_usage_unregister_event; } else { ret = -EINVAL; goto out_put_cfile; @@ -6181,7 +6186,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (ret) goto out_put_cfile; - ret = event->register_event(css, event->cft, event->eventfd, buffer); + ret = event->register_event(css, event->eventfd, buffer); if (ret) goto out_put_css; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index e0f6283..0f25a99 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -279,7 +279,6 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd * @css: css that is interested in vmpressure notifications - * @cft: cgroup control files handle * @eventfd: eventfd context to link notifications with * @args: event arguments (used to set up a pressure level threshold) * @@ -289,13 +288,10 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or * "critical"). 
* - * This function should not be used directly, just pass it to (struct - * cftype).register_event, and then cgroup core will handle everything by - * itself. + * To be used as memcg event method. */ int vmpressure_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, - const char *args) + struct eventfd_ctx *eventfd, const char *args) { struct vmpressure *vmpr = css_to_vmpressure(css); struct vmpressure_event *ev; @@ -326,19 +322,15 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure * @css: css handle - * @cft: cgroup control files handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * * This function does internal manipulations to detach the @eventfd from * the vmpressure notifications, and then frees internal resources * associated with the @eventfd (but the @eventfd itself is not freed). * - * This function should not be used directly, just pass it to (struct - * cftype).unregister_event, and then cgroup core will handle everything - * by itself. + * To be used as memcg event method. */ void vmpressure_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) { struct vmpressure *vmpr = css_to_vmpressure(css); -- cgit v0.10.2 From 59b6f87344ab5eb3057e5844b8cd8a39e668f477 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:43 -0500 Subject: memcg: make cgroup_event deal with mem_cgroup instead of cgroup_subsys_state cgroup_event is now memcg specific. Replace cgroup_event->css with ->memcg and convert [un]register_event() callbacks to take mem_cgroup pointer instead of cgroup_subsys_state one. This simplifies the code slightly and makes css_to_vmpressure() unnecessary which is removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index b048365..3e45358 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -34,11 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr); extern void vmpressure_cleanup(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); -extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); -extern int vmpressure_register_event(struct cgroup_subsys_state *css, +extern int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args); -extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, +extern void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); #else static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3c93dcf..42f2843 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -233,9 +233,9 @@ struct mem_cgroup_eventfd_list { */ struct cgroup_event { /* - * css which the event belongs to. + * memcg which the event belongs to. */ - struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; /* * eventfd to signal userspace about the event. */ @@ -249,14 +249,14 @@ struct cgroup_event { * waiter for changes related to this event. Use eventfd_signal() * on eventfd to send notification to userspace. 
*/ - int (*register_event)(struct cgroup_subsys_state *css, + int (*register_event)(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args); /* * unregister_event() callback will be called when userspace closes * the eventfd or on cgroup removing. This callback must be set, * if you want provide notification functionality. */ - void (*unregister_event)(struct cgroup_subsys_state *css, + void (*unregister_event)(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); /* * All fields below needed to unregister event when @@ -535,11 +535,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; } -struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) -{ - return &mem_cgroup_from_css(css)->vmpressure; -} - static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -5682,10 +5677,9 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int __mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; u64 threshold, usage; @@ -5764,22 +5758,21 @@ unlock: return ret; } -static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { - return __mem_cgroup_usage_register_event(css, eventfd, args, _MEM); + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); } -static int memsw_cgroup_usage_register_event(struct cgroup_subsys_state *css, +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { - return __mem_cgroup_usage_register_event(css, eventfd, args, _MEMSWAP); + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); } -static void __mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; u64 usage; @@ -5854,22 +5847,21 @@ unlock: mutex_unlock(&memcg->thresholds_lock); } -static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - return __mem_cgroup_usage_unregister_event(css, eventfd, _MEM); + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); } -static void memsw_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - return __mem_cgroup_usage_unregister_event(css, eventfd, _MEMSWAP); + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); } -static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; event = 
kmalloc(sizeof(*event), GFP_KERNEL); @@ -5889,10 +5881,9 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, return 0; } -static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; spin_lock(&memcg_oom_lock); @@ -6019,18 +6010,18 @@ static void cgroup_event_remove(struct work_struct *work) { struct cgroup_event *event = container_of(work, struct cgroup_event, remove); - struct cgroup_subsys_state *css = event->css; + struct mem_cgroup *memcg = event->memcg; remove_wait_queue(event->wqh, &event->wait); - event->unregister_event(css, event->eventfd); + event->unregister_event(memcg, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); eventfd_ctx_put(event->eventfd); kfree(event); - css_put(css); + css_put(&memcg->css); } /* @@ -6043,7 +6034,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, { struct cgroup_event *event = container_of(wait, struct cgroup_event, wait); - struct mem_cgroup *memcg = mem_cgroup_from_css(event->css); + struct mem_cgroup *memcg = event->memcg; unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { @@ -6114,7 +6105,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (!event) return -ENOMEM; - event->css = css; + event->memcg = memcg; INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -6186,7 +6177,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (ret) goto out_put_cfile; - ret = event->register_event(css, event->eventfd, buffer); + ret = event->register_event(memcg, event->eventfd, buffer); if (ret) goto out_put_css; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 0f25a99..196970a 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -278,7 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd - * @css: css that is interested in vmpressure notifications + * @memcg: memcg that is interested in vmpressure notifications * @eventfd: eventfd context to link notifications with * @args: event arguments (used to set up a pressure level threshold) * @@ -290,10 +290,10 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * * To be used as memcg event method. */ -int vmpressure_register_event(struct cgroup_subsys_state *css, +int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { - struct vmpressure *vmpr = css_to_vmpressure(css); + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; int level; @@ -321,7 +321,7 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure - * @css: css handle + * @memcg: memcg handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * * This function does internal manipulations to detach the @eventfd from @@ -330,10 +330,10 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, * * To be used as memcg event method. 
*/ -void vmpressure_unregister_event(struct cgroup_subsys_state *css, +void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - struct vmpressure *vmpr = css_to_vmpressure(css); + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; mutex_lock(&vmpr->events_lock); -- cgit v0.10.2 From 3bc942f372af383f49d56aab599469561a5e39ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:44 -0500 Subject: memcg: rename cgroup_event to mem_cgroup_event cgroup_event is only available in memcg now. Let's brand it that way. While at it, add a comment encouraging deprecation of the feature and remove the respective section from cgroup documentation. This patch is cosmetic. v3: Typo update as per Li Zefan. v2: Index in cgroups.txt updated accordingly as suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 638bf17..821de56 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -24,7 +24,6 @@ CONTENTS: 2.1 Basic Usage 2.2 Attaching processes 2.3 Mounting hierarchies by name - 2.4 Notification API 3. Kernel API 3.1 Overview 3.2 Synchronization @@ -472,25 +471,6 @@ you give a subsystem a name. The name of the subsystem appears as part of the hierarchy description in /proc/mounts and /proc//cgroups. -2.4 Notification API --------------------- - -There is mechanism which allows to get notifications about changing -status of a cgroup. - -To register a new notification handler you need to: - - create a file descriptor for event notification using eventfd(2); - - open a control file to be monitored (e.g. memory.usage_in_bytes); - - write " " to cgroup.event_control. - Interpretation of args is defined by control file implementation; - -eventfd will be woken up by control file implementation or when the -cgroup is removed. - -To unregister a notification handler just close eventfd. - -NOTE: Support of notifications should be implemented for the control -file. See documentation for the subsystem. 3. Kernel API ============= diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 42f2843..ec8582b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -231,7 +231,7 @@ struct mem_cgroup_eventfd_list { /* * cgroup_event represents events which userspace want to receive. */ -struct cgroup_event { +struct mem_cgroup_event { /* * memcg which the event belongs to. */ @@ -6002,14 +6002,27 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) #endif /* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. + * + * This is way over-engineered. It tries to support fully configurable + * events for each user. Such level of flexibility is completely + * unnecessary especially in the light of the planned unified hierarchy. + * + * Please deprecate this and replace with something simpler if at all + * possible. + */ + +/* * Unregister event and free resources. * * Gets called from workqueue. 
*/ -static void cgroup_event_remove(struct work_struct *work) +static void memcg_event_remove(struct work_struct *work) { - struct cgroup_event *event = container_of(work, struct cgroup_event, - remove); + struct mem_cgroup_event *event = + container_of(work, struct mem_cgroup_event, remove); struct mem_cgroup *memcg = event->memcg; remove_wait_queue(event->wqh, &event->wait); @@ -6029,11 +6042,11 @@ static void cgroup_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. */ -static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) +static int memcg_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) { - struct cgroup_event *event = container_of(wait, - struct cgroup_event, wait); + struct mem_cgroup_event *event = + container_of(wait, struct mem_cgroup_event, wait); struct mem_cgroup *memcg = event->memcg; unsigned long flags = (unsigned long)key; @@ -6062,27 +6075,29 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, return 0; } -static void cgroup_event_ptable_queue_proc(struct file *file, +static void memcg_event_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { - struct cgroup_event *event = container_of(pt, - struct cgroup_event, pt); + struct mem_cgroup_event *event = + container_of(pt, struct mem_cgroup_event, pt); event->wqh = wqh; add_wait_queue(wqh, &event->wait); } /* + * DO NOT USE IN NEW FILES. + * * Parse input and register new cgroup event handler. * * Input must be in format ' '. * Interpretation of args is defined by control file implementation. */ -static int cgroup_write_event_control(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) +static int memcg_write_event_control(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cgroup_event *event; + struct mem_cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; struct fd efile; @@ -6107,9 +6122,9 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event->memcg = memcg; INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); - INIT_WORK(&event->remove, cgroup_event_remove); + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, memcg_event_wake); + INIT_WORK(&event->remove, memcg_event_remove); efile = fdget(efd); if (!efile.file) { @@ -6140,6 +6155,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, * to be done via struct cftype but cgroup core no longer knows * about these events. The following is crude but the whole thing * is for compatibility anyway. + * + * DO NOT ADD NEW FILES. 
*/ name = cfile.file->f_dentry->d_name.name; @@ -6251,8 +6268,8 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_hierarchy_read, }, { - .name = "cgroup.event_control", - .write_string = cgroup_write_event_control, + .name = "cgroup.event_control", /* XXX: for compat */ + .write_string = memcg_write_event_control, .flags = CFTYPE_NO_PREFIX, .mode = S_IWUGO, }, @@ -6585,7 +6602,7 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cgroup_event *event, *tmp; + struct mem_cgroup_event *event, *tmp; /* * Unregister events and notify userspace. -- cgit v0.10.2 From b36824c75c7855585d6476eef2b234f6e0e68872 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:44 -0500 Subject: cgroup: unexport cgroup_css() and remove __file_cft() Now that cgroup_event is made memcg specific, the temporarily exported functions are no longer necessary. Unexport cgroup_css() and remove __file_cft() which doesn't have any user left. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 612adc5..8d9fa89 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -883,11 +883,6 @@ unsigned short css_id(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); -/* XXX: temporary */ -struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - struct cgroup_subsys *ss); -struct cftype *__file_cft(struct file *file); - #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index feda7c5..c0248e1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -202,8 +202,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled. */ -struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) +static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) { if (ss) return rcu_dereference_check(cgrp->subsys[ss->subsys_id], @@ -2625,16 +2625,6 @@ static const struct inode_operations cgroup_dir_inode_operations = { .removexattr = cgroup_removexattr, }; -/* - * Check if a file is a control file - */ -struct cftype *__file_cft(struct file *file) -{ - if (file_inode(file)->i_fop != &cgroup_file_operations) - return ERR_PTR(-EINVAL); - return __d_cft(file->f_dentry); -} - static int cgroup_create_file(struct dentry *dentry, umode_t mode, struct super_block *sb) { -- cgit v0.10.2 From ac1e69aa78e2e799a8d5475595ba744bdf29b597 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Nov 2013 10:42:58 -0500 Subject: cgroup: don't skip seq_open on write only opens on pidlist files Currently, cgroup_pidlist_open() skips seq_open() and pidlist loading if the file is opened write-only, which is a sensible optimization as pidlist loading can be costly and there often are occasions where tasks or cgroup.procs is opened write-only. However, pidlist init and release are planned to be moved to cgroup_pidlist_start/stop() respectively which would make this optimization unnecessary. This patch removes the optimization and always fully initializes pidlist files regardless of open mode. 
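For reference, the write-only open that the removed optimization targeted is the everyday task-migration write. A minimal user-space sketch, assuming a memory hierarchy mounted at the usual path (the cgroup name is hypothetical):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* Opened O_WRONLY: the pidlist is never read through this fd. */
            int fd = open("/sys/fs/cgroup/memory/mygrp/cgroup.procs", O_WRONLY);

            if (fd < 0)
                    return 1;
            dprintf(fd, "%d\n", getpid());  /* migrate self into the cgroup */
            return close(fd) ? 1 : 0;
    }
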
This will help moving pidlist handling to start/stop by unifying rw paths and removes the need for specifying cftype->release() in addition to .release in cgroup_pidlist_operations as file->f_op is now always overridden. As pidlist files were the only user of cftype->release(), the next patch will remove the method. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 34fd1be..1e09ded 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3781,12 +3781,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) static int cgroup_pidlist_release(struct inode *inode, struct file *file) { struct cgroup_pidlist *l; - if (!(file->f_mode & FMODE_READ)) - return 0; - /* - * the seq_file will only be initialized if the file was opened for - * reading; hence we check if it's not null only in that case. - */ + l = ((struct seq_file *)file->private_data)->private; cgroup_release_pid_array(l); return seq_release(inode, file); @@ -3811,10 +3806,6 @@ static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) struct cgroup_pidlist *l; int retval; - /* Nothing to do for write-only files */ - if (!(file->f_mode & FMODE_READ)) - return 0; - /* have the array populated */ retval = pidlist_array_load(cgrp, type, &l); if (retval) @@ -3894,7 +3885,6 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.procs", .open = cgroup_procs_open, .write_u64 = cgroup_procs_write, - .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { @@ -3919,7 +3909,6 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_INSANE, /* use "procs" instead */ .open = cgroup_tasks_open, .write_u64 = cgroup_tasks_write, - .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { -- cgit v0.10.2 From b9f3cecaba592d4e98cd155c91b1414323ed51e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Nov 2013 10:42:58 -0500 Subject: cgroup: remove cftype->release() Now that pidlist files don't use cftype->release(), it doesn't have any user left. Remove it. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 492fa01..5207c28 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -499,8 +499,6 @@ struct cftype { * kick type for multiplexing. */ int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); - - int (*release)(struct inode *inode, struct file *file); }; /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1e09ded..b59e3453 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2448,12 +2448,9 @@ static int cgroup_file_open(struct inode *inode, struct file *file) static int cgroup_file_release(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); struct cgroup_subsys_state *css = cfe->css; int ret = 0; - if (cft->release) - ret = cft->release(inode, file); if (css->ss) css_put(css); if (file->f_op == &cgroup_seqfile_operations) -- cgit v0.10.2 From b1a21367314f36a819c0676e0999f34db12ee6ed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Nov 2013 10:42:58 -0500 Subject: cgroup: implement delayed destruction for cgroup_pidlist Currently, pidlists are reference counted from file open and release methods. This means that holding onto an open file may waste memory and reads may return data which is very stale. Both aren't critical because pidlists are keyed and shared per namespace and, well, the user isn't supposed to have large delay between open and reads. 
cgroup is planned to be converted to use kernfs and it'd be best if we can stick to just the seq_file operations - start, next, stop and show. This can be achieved by loading pidlist on demand from start and release with time delay from stop, so that consecutive reads don't end up reloading the pidlist on each iteration. This would remove the need for hooking into open and release while also avoiding issues with holding onto pidlist for too long. This patch implements delayed release of pidlist. As pidlists could be lingering on cgroup removal waiting for the timer to expire, cgroup free path needs to queue the destruction work item immediately and flush. As those work items are self-destroying, each work item can't be flushed directly. A new workqueue - cgroup_pidlist_destroy_wq - is added to serve as flush domain. Note that this patch just adds delayed release on top of the current implementation and doesn't change where pidlist is loaded and released. Following patches will make those changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b59e3453..acdcddf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -62,6 +62,14 @@ #include /* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * @@ -95,6 +103,12 @@ static DEFINE_MUTEX(cgroup_root_mutex); static struct workqueue_struct *cgroup_destroy_wq; /* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. + */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + +/* * Generate an array of cgroup subsystem pointers. At boot time, this is * populated with the built in subsystems, and modular subsystems are * registered after that. The mutable section of this array is protected by @@ -167,6 +181,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static int cgroup_file_release(struct inode *inode, struct file *file); +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -830,11 +845,7 @@ static void cgroup_free_fn(struct work_struct *work) */ deactivate_super(cgrp->root->sb); - /* - * if we're getting rid of the cgroup, refcount should ensure - * that there are no pidlists left. 
- */ - BUG_ON(!list_empty(&cgrp->pidlists)); + cgroup_pidlist_destroy_all(cgrp); simple_xattrs_free(&cgrp->xattrs); @@ -2449,13 +2460,12 @@ static int cgroup_file_release(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cgroup_subsys_state *css = cfe->css; - int ret = 0; if (css->ss) css_put(css); if (file->f_op == &cgroup_seqfile_operations) single_release(inode, file); - return ret; + return 0; } /* @@ -3454,6 +3464,8 @@ struct cgroup_pidlist { struct cgroup *owner; /* protects the other fields */ struct rw_semaphore rwsem; + /* for delayed destruction */ + struct delayed_work destroy_dwork; }; /* @@ -3469,6 +3481,7 @@ static void *pidlist_allocate(int count) else return kmalloc(count * sizeof(pid_t), GFP_KERNEL); } + static void pidlist_free(void *p) { if (is_vmalloc_addr(p)) @@ -3478,6 +3491,49 @@ static void pidlist_free(void *p) } /* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. + */ +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) +{ + struct cgroup_pidlist *l, *tmp_l; + + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); + + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); +} + +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); + down_write(&l->rwsem); + + /* + * Destroy iff we didn't race with a new user or get queued again. + * Queued state won't change as it can only be queued while locked. + */ + if (!l->use_count && !delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } + + up_write(&l->rwsem); + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); +} + +/* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number of unique elements. */ @@ -3547,6 +3603,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, return l; } init_rwsem(&l->rwsem); + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); down_write(&l->rwsem); l->key.type = type; l->key.ns = get_pid_ns(ns); @@ -3752,26 +3809,12 @@ static const struct seq_operations cgroup_pidlist_seq_operations = { static void cgroup_release_pid_array(struct cgroup_pidlist *l) { - /* - * the case where we're the last user of this particular pidlist will - * have us remove it from the cgroup's list, which entails taking the - * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> - * pidlist_mutex, we have to take pidlist_mutex first. 
- */ - mutex_lock(&l->owner->pidlist_mutex); down_write(&l->rwsem); BUG_ON(!l->use_count); - if (!--l->use_count) { - /* we're the last user if refcount is 0; remove and free */ - list_del(&l->links); - mutex_unlock(&l->owner->pidlist_mutex); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - up_write(&l->rwsem); - kfree(l); - return; - } - mutex_unlock(&l->owner->pidlist_mutex); + /* if the last user, arm the destroy work */ + if (!--l->use_count) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); up_write(&l->rwsem); } @@ -4813,6 +4856,15 @@ static int __init cgroup_wq_init(void) */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); + + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. + */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + return 0; } core_initcall(cgroup_wq_init); -- cgit v0.10.2 From 62236858f3cc558135d1ab6b2af44d22c489bb7f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Nov 2013 10:42:58 -0500 Subject: cgroup: introduce struct cgroup_pidlist_open_file For pidlist files, seq_file->private pointed to the loaded cgroup_pidlist; however, pidlist loading is planned to be moved to cgroup_pidlist_start() for kernfs conversion and seq_file->private needs to carry more information from open to allow that. This patch introduces struct cgroup_pidlist_open_file which contains type, cgrp and pidlist and updates pidlist seq_file->private to point to it using seq_open_private() and seq_release_private(). Note that this eventually will be replaced by kernfs_open_file. While this patch makes more information available to seq_file operations, they don't use it yet and this patch doesn't introduce any behavior changes except for allocation of the extra private struct. v2: use __seq_open_private() instead of seq_open_private() for brevity as suggested by Li. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index acdcddf..ef019c0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3468,6 +3468,13 @@ struct cgroup_pidlist { struct delayed_work destroy_dwork; }; +/* seq_file->private points to the following */ +struct cgroup_pidlist_open_file { + enum cgroup_filetype type; + struct cgroup *cgrp; + struct cgroup_pidlist *pidlist; +}; + /* * The following two functions "fix" the issue where there are more pids * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. @@ -3739,7 +3746,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * after a seek to the start). 
Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pidlist *l = s->private; + struct cgroup_pidlist_open_file *of = s->private; + struct cgroup_pidlist *l = of->pidlist; int index = 0, pid = *pos; int *iter; @@ -3769,13 +3777,15 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_pidlist *l = s->private; - up_read(&l->rwsem); + struct cgroup_pidlist_open_file *of = s->private; + + up_read(&of->pidlist->rwsem); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pidlist *l = s->private; + struct cgroup_pidlist_open_file *of = s->private; + struct cgroup_pidlist *l = of->pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* @@ -3820,11 +3830,11 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) static int cgroup_pidlist_release(struct inode *inode, struct file *file) { - struct cgroup_pidlist *l; + struct cgroup_pidlist_open_file *of; - l = ((struct seq_file *)file->private_data)->private; - cgroup_release_pid_array(l); - return seq_release(inode, file); + of = ((struct seq_file *)file->private_data)->private; + cgroup_release_pid_array(of->pidlist); + return seq_release_private(inode, file); } static const struct file_operations cgroup_pidlist_operations = { @@ -3843,6 +3853,7 @@ static const struct file_operations cgroup_pidlist_operations = { static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct cgroup_pidlist_open_file *of; struct cgroup_pidlist *l; int retval; @@ -3853,12 +3864,16 @@ static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) /* configure file information */ file->f_op = &cgroup_pidlist_operations; - retval = seq_open(file, &cgroup_pidlist_seq_operations); - if (retval) { + of = __seq_open_private(file, &cgroup_pidlist_seq_operations, + sizeof(*of)); + if (!of) { cgroup_release_pid_array(l); - return retval; + return -ENOMEM; } - ((struct seq_file *)file->private_data)->private = l; + + of->type = type; + of->cgrp = cgrp; + of->pidlist = l; return 0; } static int cgroup_tasks_open(struct inode *unused, struct file *file) -- cgit v0.10.2 From e6b817103d168a76e4044ebcdbc08225d77a81cb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Nov 2013 10:42:59 -0500 Subject: cgroup: refactor cgroup_pidlist_find() Rename cgroup_pidlist_find() to cgroup_pidlist_find_create() and separate out finding proper to cgroup_pidlist_find(). Also, move locking to the caller. This patch is preparation for pidlist restructure and doesn't introduce any behavior changes. 
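The find/create split above follows a common kernel shape: a lock-assuming lookup helper plus a creation wrapper that reuses it, so a later patch in this series can call the bare lookup from cgroup_pidlist_start(). A minimal userspace sketch of the same shape, with illustrative names and a pthread mutex standing in for pidlist_mutex and lockdep:

/*
 * Userspace sketch of the find/find_create split: pidlist_find() only
 * looks up and assumes the caller holds the lock (cf.
 * lockdep_assert_held); pidlist_find_create() layers the allocation
 * path on top of it. Illustrative names, not the kernel code.
 */
#include <assert.h>
#include <pthread.h>
#include <stdlib.h>

struct pidlist {
        int key;
        struct pidlist *next;
};

static pthread_mutex_t pidlist_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct pidlist *pidlists;        /* protected by pidlist_mutex */

/* lookup only; caller must hold pidlist_mutex */
static struct pidlist *pidlist_find(int key)
{
        struct pidlist *l;

        for (l = pidlists; l; l = l->next)
                if (l->key == key)
                        return l;
        return NULL;
}

/* find-or-create; runs entirely under the caller's lock */
static struct pidlist *pidlist_find_create(int key)
{
        struct pidlist *l = pidlist_find(key);

        if (l)
                return l;
        l = calloc(1, sizeof(*l));
        if (!l)
                return NULL;
        l->key = key;
        l->next = pidlists;
        pidlists = l;
        return l;
}

int main(void)
{
        pthread_mutex_lock(&pidlist_mutex);
        struct pidlist *a = pidlist_find_create(1);
        struct pidlist *b = pidlist_find_create(1);
        assert(a && a == b);    /* second call finds, does not create */
        pthread_mutex_unlock(&pidlist_mutex);
        return 0;
}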
Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ef019c0..d58c30d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3575,48 +3575,50 @@ static int cmppid(const void *a, const void *b) return *(pid_t *)a - *(pid_t *)b; } +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = task_active_pid_ns(current); + + lockdep_assert_held(&cgrp->pidlist_mutex); + + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; +} + /* * find the appropriate pidlist for our purpose (given procs vs tasks) * returns with the lock on that pidlist already held, and takes care * of the use count, or returns NULL with no locks held if we're out of * memory. */ -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) { struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); - /* - * We can't drop the pidlist_mutex before taking the l->rwsem in case - * the last ref-holder is trying to remove l from the list at the same - * time. Holding the pidlist_mutex precludes somebody taking whichever - * list we find out from under us - compare release_pid_array(). - */ - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry(l, &cgrp->pidlists, links) { - if (l->key.type == type && l->key.ns == ns) { - /* make sure l doesn't vanish out from under us */ - down_write(&l->rwsem); - mutex_unlock(&cgrp->pidlist_mutex); - return l; - } - } + lockdep_assert_held(&cgrp->pidlist_mutex); + + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; + /* entry not found; create a new one */ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) { - mutex_unlock(&cgrp->pidlist_mutex); + if (!l) return l; - } + init_rwsem(&l->rwsem); INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); - down_write(&l->rwsem); l->key.type = type; - l->key.ns = get_pid_ns(ns); + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); - mutex_unlock(&cgrp->pidlist_mutex); return l; } @@ -3662,17 +3664,26 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, sort(array, length, sizeof(pid_t), cmppid, NULL); if (type == CGROUP_FILE_PROCS) length = pidlist_uniq(array, length); - l = cgroup_pidlist_find(cgrp, type); + + mutex_lock(&cgrp->pidlist_mutex); + + l = cgroup_pidlist_find_create(cgrp, type); if (!l) { + mutex_unlock(&cgrp->pidlist_mutex); pidlist_free(array); return -ENOMEM; } - /* store array, freeing old if necessary - lock already held */ + + /* store array, freeing old if necessary */ + down_write(&l->rwsem); pidlist_free(l->list); l->list = array; l->length = length; l->use_count++; up_write(&l->rwsem); + + mutex_unlock(&cgrp->pidlist_mutex); + *lp = l; return 0; } -- cgit v0.10.2 From 069df3b7aeb3f4e926c4da9630c92010909af512 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Nov 2013 10:42:59 -0500 Subject: cgroup: remove cgroup_pidlist->rwsem cgroup_pidlist locking is needlessly complicated. 
It has outer cgroup->pidlist_mutex to protect the list of pidlists associated with a cgroup and then each pidlist has rwsem to synchronize updates and reads. Given that the only read access is from seq_file operations which are always invoked back-to-back, the rwsem is a giant overkill. All it does is adding unnecessary complexity. This patch removes cgroup_pidlist->rwsem and protects all accesses to pidlists belonging to a cgroup with cgroup->pidlist_mutex. pidlist->rwsem locking is removed if it's nested inside cgroup->pidlist_mutex; otherwise, it's replaced with cgroup->pidlist_mutex locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d58c30d..dc39e17 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3462,8 +3462,6 @@ struct cgroup_pidlist { struct list_head links; /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; - /* protects the other fields */ - struct rw_semaphore rwsem; /* for delayed destruction */ struct delayed_work destroy_dwork; }; @@ -3522,7 +3520,6 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) struct cgroup_pidlist *tofree = NULL; mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->rwsem); /* * Destroy iff we didn't race with a new user or get queued again. @@ -3535,7 +3532,6 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) tofree = l; } - up_write(&l->rwsem); mutex_unlock(&l->owner->pidlist_mutex); kfree(tofree); } @@ -3612,7 +3608,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, if (!l) return l; - init_rwsem(&l->rwsem); INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); l->key.type = type; /* don't need task_nsproxy() if we're looking at ourself */ @@ -3675,12 +3670,10 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, } /* store array, freeing old if necessary */ - down_write(&l->rwsem); pidlist_free(l->list); l->list = array; l->length = length; l->use_count++; - up_write(&l->rwsem); mutex_unlock(&cgrp->pidlist_mutex); @@ -3762,7 +3755,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) int index = 0, pid = *pos; int *iter; - down_read(&l->rwsem); + mutex_lock(&of->cgrp->pidlist_mutex); if (pid) { int end = l->length; @@ -3790,7 +3783,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct cgroup_pidlist_open_file *of = s->private; - up_read(&of->pidlist->rwsem); + mutex_unlock(&of->cgrp->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) @@ -3830,13 +3823,13 @@ static const struct seq_operations cgroup_pidlist_seq_operations = { static void cgroup_release_pid_array(struct cgroup_pidlist *l) { - down_write(&l->rwsem); + mutex_lock(&l->owner->pidlist_mutex); BUG_ON(!l->use_count); /* if the last user, arm the destroy work */ if (!--l->use_count) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, CGROUP_PIDLIST_DESTROY_DELAY); - up_write(&l->rwsem); + mutex_unlock(&l->owner->pidlist_mutex); } static int cgroup_pidlist_release(struct inode *inode, struct file *file) -- cgit v0.10.2 From 4bac00d16a8760eae7205e41d2c246477d42a210 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Nov 2013 10:42:59 -0500 Subject: cgroup: load and release pidlists from seq_file start and stop respectively Currently, pidlists are reference counted from file open and release methods. 
This means that holding onto an open file may waste memory and reads may return data which is very stale. Both aren't critical because pidlists are keyed and shared per namespace and, well, the user isn't supposed to have large delay between open and reads. cgroup is planned to be converted to use kernfs and it'd be best if we can stick to just the seq_file operations - start, next, stop and show. This can be achieved by loading pidlist on demand from start and release with time delay from stop, so that consecutive reads don't end up reloading the pidlist on each iteration. This would remove the need for hooking into open and release while also avoiding issues with holding onto pidlist for too long. The previous patches implemented delayed release and restructured pidlist handling so that pidlists can be loaded and released from seq_file start / stop. This patch actually moves pidlist load to start and release to stop. This means that pidlist is pinned only between start and stop and may go away between two consecutive read calls if the two calls are apart by more than CGROUP_PIDLIST_DESTROY_DELAY. cgroup_pidlist_start() thus can't re-use the stored cgroup_pid_list_open_file->pidlist directly. During start, it's only used as a hint indicating whether this is the first start after open or not and pidlist is always looked up or created. pidlist_mutex locking and reference counting are moved out of pidlist_array_load() so that pidlist_array_load() can perform lookup and creation atomically. While this enlarges the area covered by pidlist_mutex, given how the lock is used, it's highly unlikely to be noticeable. v2: Refreshed on top of the updated "cgroup: introduce struct cgroup_pidlist_open_file". Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dc39e17..671cbde 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3473,6 +3473,8 @@ struct cgroup_pidlist_open_file { struct cgroup_pidlist *pidlist; }; +static void cgroup_release_pid_array(struct cgroup_pidlist *l); + /* * The following two functions "fix" the issue where there are more pids * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. @@ -3630,6 +3632,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, struct task_struct *tsk; struct cgroup_pidlist *l; + lockdep_assert_held(&cgrp->pidlist_mutex); + /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the @@ -3660,8 +3664,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (type == CGROUP_FILE_PROCS) length = pidlist_uniq(array, length); - mutex_lock(&cgrp->pidlist_mutex); - l = cgroup_pidlist_find_create(cgrp, type); if (!l) { mutex_unlock(&cgrp->pidlist_mutex); @@ -3673,10 +3675,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, pidlist_free(l->list); l->list = array; l->length = length; - l->use_count++; - - mutex_unlock(&cgrp->pidlist_mutex); - *lp = l; return 0; } @@ -3751,11 +3749,34 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * next pid to display, if any */ struct cgroup_pidlist_open_file *of = s->private; - struct cgroup_pidlist *l = of->pidlist; + struct cgroup *cgrp = of->cgrp; + struct cgroup_pidlist *l; int index = 0, pid = *pos; - int *iter; + int *iter, ret; + + mutex_lock(&cgrp->pidlist_mutex); + + /* + * !NULL @of->pidlist indicates that this isn't the first start() + * after open. 
If the matching pidlist is around, we can use that. + * Look for it. Note that @of->pidlist can't be used directly. It + * could already have been destroyed. + */ + if (of->pidlist) + of->pidlist = cgroup_pidlist_find(cgrp, of->type); + + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. + */ + if (!of->pidlist) { + ret = pidlist_array_load(of->cgrp, of->type, &of->pidlist); + if (ret) + return ERR_PTR(ret); + } + l = of->pidlist; + l->use_count++; - mutex_lock(&of->cgrp->pidlist_mutex); if (pid) { int end = l->length; @@ -3784,6 +3805,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) struct cgroup_pidlist_open_file *of = s->private; mutex_unlock(&of->cgrp->pidlist_mutex); + if (of->pidlist) + cgroup_release_pid_array(of->pidlist); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) @@ -3832,20 +3855,11 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) mutex_unlock(&l->owner->pidlist_mutex); } -static int cgroup_pidlist_release(struct inode *inode, struct file *file) -{ - struct cgroup_pidlist_open_file *of; - - of = ((struct seq_file *)file->private_data)->private; - cgroup_release_pid_array(of->pidlist); - return seq_release_private(inode, file); -} - static const struct file_operations cgroup_pidlist_operations = { .read = seq_read, .llseek = seq_lseek, .write = cgroup_file_write, - .release = cgroup_pidlist_release, + .release = seq_release_private, }; /* @@ -3858,26 +3872,17 @@ static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); struct cgroup_pidlist_open_file *of; - struct cgroup_pidlist *l; - int retval; - /* have the array populated */ - retval = pidlist_array_load(cgrp, type, &l); - if (retval) - return retval; /* configure file information */ file->f_op = &cgroup_pidlist_operations; of = __seq_open_private(file, &cgroup_pidlist_seq_operations, sizeof(*of)); - if (!of) { - cgroup_release_pid_array(l); + if (!of) return -ENOMEM; - } of->type = type; of->cgrp = cgrp; - of->pidlist = l; return 0; } static int cgroup_tasks_open(struct inode *unused, struct file *file) -- cgit v0.10.2 From 045023658ca1e30dc0bb1f148b42c95b740d3e02 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Nov 2013 10:42:59 -0500 Subject: cgroup: remove cgroup_pidlist->use_count After the recent changes, pidlist ref is held only between cgroup_pidlist_start() and cgroup_pidlist_stop() during which cgroup->pidlist_mutex is also held. IOW, the reference count is redundant now. While in use, it's always one and pidlist_mutex is held - holding the mutex has exactly the same protection. This patch collapses destroy_dwork queueing into cgroup_pidlist_stop() so that pidlist_mutex is not released inbetween and drops pidlist->use_count. This patch shouldn't introduce any behavior changes. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 671cbde..a245803 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3456,8 +3456,6 @@ struct cgroup_pidlist { pid_t *list; /* how many elements the above list has */ int length; - /* how many files are using the current array */ - int use_count; /* each of these stored in a list by its cgroup */ struct list_head links; /* pointer to the cgroup we belong to, for list removal purposes */ @@ -3473,8 +3471,6 @@ struct cgroup_pidlist_open_file { struct cgroup_pidlist *pidlist; }; -static void cgroup_release_pid_array(struct cgroup_pidlist *l); - /* * The following two functions "fix" the issue where there are more pids * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. @@ -3524,10 +3520,10 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) mutex_lock(&l->owner->pidlist_mutex); /* - * Destroy iff we didn't race with a new user or get queued again. - * Queued state won't change as it can only be queued while locked. + * Destroy iff we didn't get queued again. The state won't change + * as destroy_dwork can only be queued while locked. */ - if (!l->use_count && !delayed_work_pending(dwork)) { + if (!delayed_work_pending(dwork)) { list_del(&l->links); pidlist_free(l->list); put_pid_ns(l->key.ns); @@ -3775,7 +3771,6 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) return ERR_PTR(ret); } l = of->pidlist; - l->use_count++; if (pid) { int end = l->length; @@ -3804,9 +3799,11 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct cgroup_pidlist_open_file *of = s->private; - mutex_unlock(&of->cgrp->pidlist_mutex); if (of->pidlist) - cgroup_release_pid_array(of->pidlist); + mod_delayed_work(cgroup_pidlist_destroy_wq, + &of->pidlist->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&of->cgrp->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) @@ -3844,17 +3841,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = { .show = cgroup_pidlist_show, }; -static void cgroup_release_pid_array(struct cgroup_pidlist *l) -{ - mutex_lock(&l->owner->pidlist_mutex); - BUG_ON(!l->use_count); - /* if the last user, arm the destroy work */ - if (!--l->use_count) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, - CGROUP_PIDLIST_DESTROY_DELAY); - mutex_unlock(&l->owner->pidlist_mutex); -} - static const struct file_operations cgroup_pidlist_operations = { .read = seq_read, .llseek = seq_lseek, -- cgit v0.10.2 From afb2bc14e1c989cf0635bd04edb5ff55b8c1c7bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Nov 2013 10:42:59 -0500 Subject: cgroup: don't guarantee cgroup.procs is sorted if sane_behavior For some reason, tasks and cgroup.procs guarantee that the result is sorted. This is the only reason this whole pidlist logic is necessary instead of just iterating through sorted member tasks. We can't do anything about the existing interface but at least ensure that such expectation doesn't exist for the new interface so that pidlist logic may be removed in the distant future. This patch scrambles the sort order if sane_behavior so that the output is usually not sorted in the new interface. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5207c28..50d8cc3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -275,6 +275,9 @@ enum { * - "tasks" is removed. 
Everything should be at process * granularity. Use "cgroup.procs" instead. * + * - "cgroup.procs" is not sorted. pids will be unique unless they + * got recycled inbetween reads. + * * - "release_agent" and "notify_on_release" are removed. * Replacement notification mechanism will be implemented. * diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a245803..f9f5fe3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3564,11 +3564,49 @@ after: return dest; } +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + * + * All this extra complexity was caused by the original implementation + * committing to an entirely unnecessary property. In the long term, we + * want to do away with it. Explicitly scramble sort order if + * sane_behavior so that no such expectation exists in the new interface. + * + * Scrambling is done by swapping every two consecutive bits, which is + * non-identity one-to-one mapping which disturbs sort order sufficiently. + */ +static pid_t pid_fry(pid_t pid) +{ + unsigned a = pid & 0x55555555; + unsigned b = pid & 0xAAAAAAAA; + + return (a << 1) | (b >> 1); +} + +static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) +{ + if (cgroup_sane_behavior(cgrp)) + return pid_fry(pid); + else + return pid; +} + static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } +static int fried_cmppid(const void *a, const void *b) +{ + return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); +} + static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, enum cgroup_filetype type) { @@ -3656,7 +3694,10 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ - sort(array, length, sizeof(pid_t), cmppid, NULL); + if (cgroup_sane_behavior(cgrp)) + sort(array, length, sizeof(pid_t), fried_cmppid, NULL); + else + sort(array, length, sizeof(pid_t), cmppid, NULL); if (type == CGROUP_FILE_PROCS) length = pidlist_uniq(array, length); @@ -3777,10 +3818,10 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) while (index < end) { int mid = (index + end) / 2; - if (l->list[mid] == pid) { + if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { index = mid; break; - } else if (l->list[mid] <= pid) + } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) index = mid + 1; else end = mid; @@ -3791,7 +3832,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; - *pos = *iter; + *pos = cgroup_pid_fry(cgrp, *iter); return iter; } @@ -3820,7 +3861,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) if (p >= end) { return NULL; } else { - *pos = *p; + *pos = cgroup_pid_fry(of->cgrp, *p); return p; } } -- cgit v0.10.2 From 44ffc75ba9a63f972dbebd4fab6888db5fcd3b0e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:01 -0500 Subject: cgroup, sched: convert away from cftype->read_map() In preparation of conversion to kernfs, cgroup file handling is being 
consolidated so that it can be easily mapped to the seq_file based interface of kernfs. cftype->read_map() doesn't add any value and being replaced with ->read_seq_string(). Update cpu_stats_show() and cpuacct_stats_show() accordingly. This patch doesn't make any visible behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Ingo Molnar Cc: Peter Zijlstra diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c180860..f28ec67 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7257,14 +7257,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) } static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, - struct cgroup_map_cb *cb) + struct seq_file *sf) { struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - cb->fill(cb, "nr_periods", cfs_b->nr_periods); - cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); - cb->fill(cb, "throttled_time", cfs_b->throttled_time); + seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); + seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); + seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); return 0; } @@ -7318,7 +7318,7 @@ static struct cftype cpu_files[] = { }, { .name = "stat", - .read_map = cpu_stats_show, + .read_seq_string = cpu_stats_show, }, #endif #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f64722f..dd88738 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -184,7 +184,7 @@ static const char * const cpuacct_stat_desc[] = { }; static int cpuacct_stats_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct cgroup_map_cb *cb) + struct cftype *cft, struct seq_file *sf) { struct cpuacct *ca = css_ca(css); int cpu; @@ -196,7 +196,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, val += kcpustat->cpustat[CPUTIME_NICE]; } val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); val = 0; for_each_online_cpu(cpu) { @@ -207,7 +207,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, } val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); return 0; } @@ -224,7 +224,7 @@ static struct cftype files[] = { }, { .name = "stat", - .read_map = cpuacct_stats_show, + .read_seq_string = cpuacct_stats_show, }, { } /* terminate */ }; -- cgit v0.10.2 From 51ffe41178c49738e4d6d17b9bdcf528a262abc8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:02 -0500 Subject: cpuset: convert away from cftype->read() In preparation of conversion to kernfs, cgroup file handling is being consolidated so that it can be easily mapped to the seq_file based interface of kernfs. All users of cftype->read() can be easily served, usually better, by seq_file and other methods. Rename cpuset_common_file_read() to cpuset_common_read_seq_string() and convert it to use read_seq_string() interface instead. This not only simplifies the code but also makes it more versatile. Before, the file couldn't output if the result is longer than PAGE_SIZE. After the conversion, seq_file automatically grows the buffer until the output can fit. This patch doesn't make any visible behavior changes except for being able to handle output larger than PAGE_SIZE. 
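The larger-than-PAGE_SIZE point relies on seq_file retrying with a bigger buffer until the output fits. A simplified userspace model of that retry loop, assuming nothing about the real fs/seq_file.c internals beyond the grow-and-retry behavior:

/*
 * Simplified model of seq_file's grow-and-retry buffering: format into
 * the buffer; if the output would not fit, double the buffer and try
 * again. Illustrative only; the real logic lives in fs/seq_file.c.
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct seq_buf {
        char *buf;
        size_t size;
        size_t count;
};

static int seq_buf_printf(struct seq_buf *s, const char *fmt, ...)
{
        for (;;) {
                size_t avail = s->size - s->count;
                va_list ap;
                int n;

                va_start(ap, fmt);
                n = vsnprintf(s->buf + s->count, avail, fmt, ap);
                va_end(ap);
                if (n < 0)
                        return -1;
                if ((size_t)n < avail) {        /* fits: commit it */
                        s->count += n;
                        return 0;
                }
                s->size *= 2;                   /* overflow: grow and retry */
                s->buf = realloc(s->buf, s->size);
                if (!s->buf)
                        return -1;
        }
}

int main(void)
{
        struct seq_buf s = { .buf = malloc(8), .size = 8, .count = 0 };

        /* longer than the initial buffer, like a cpus list > PAGE_SIZE */
        seq_buf_printf(&s, "%s\n", "0-3,8-11,16-19,24-27,32-35");
        fputs(s.buf, stdout);
        free(s.buf);
        return 0;
}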
Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4772034..032929f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1731,66 +1731,43 @@ out_unlock: * used, list of ranges of sequential numbers, is variable length, * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. - * A single large read to a buffer that crosses a page boundary is - * ok, because the result being copied to user land is not recomputed - * across a page fault. */ - -static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) -{ - size_t count; - - mutex_lock(&callback_mutex); - count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); - mutex_unlock(&callback_mutex); - - return count; -} - -static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) -{ - size_t count; - - mutex_lock(&callback_mutex); - count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); - mutex_unlock(&callback_mutex); - - return count; -} - -static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static int cpuset_common_read_seq_string(struct cgroup_subsys_state *css, + struct cftype *cft, + struct seq_file *sf) { struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; - char *page; - ssize_t retval = 0; - char *s; + ssize_t count; + char *buf, *s; + int ret = 0; - if (!(page = (char *)__get_free_page(GFP_TEMPORARY))) - return -ENOMEM; + count = seq_get_buf(sf, &buf); + s = buf; - s = page; + mutex_lock(&callback_mutex); switch (type) { case FILE_CPULIST: - s += cpuset_sprintf_cpulist(s, cs); + s += cpulist_scnprintf(s, count, cs->cpus_allowed); break; case FILE_MEMLIST: - s += cpuset_sprintf_memlist(s, cs); + s += nodelist_scnprintf(s, count, cs->mems_allowed); break; default: - retval = -EINVAL; - goto out; + ret = -EINVAL; + goto out_unlock; } - *s++ = '\n'; - retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); -out: - free_page((unsigned long)page); - return retval; + if (s < buf + count - 1) { + *s++ = '\n'; + seq_commit(sf, s - buf); + } else { + seq_commit(sf, -1); + } +out_unlock: + mutex_unlock(&callback_mutex); + return ret; } static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) @@ -1847,7 +1824,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) static struct cftype files[] = { { .name = "cpus", - .read = cpuset_common_file_read, + .read_seq_string = cpuset_common_read_seq_string, .write_string = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, @@ -1855,7 +1832,7 @@ static struct cftype files[] = { { .name = "mems", - .read = cpuset_common_file_read, + .read_seq_string = cpuset_common_read_seq_string, .write_string = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, -- cgit v0.10.2 From 791badbdb3e4fc1001ee3bcdaedc6d4f167fcbe8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:02 -0500 Subject: memcg: convert away from cftype->read() and ->read_map() In preparation of conversion to kernfs, cgroup file handling is being consolidated so that it can be easily mapped to the seq_file based interface of kernfs. cftype->read_map() doesn't add any value and being replaced with ->read_seq_string(), and all users of cftype->read() can be easily served, usually better, by seq_file and other methods. 
Update mem_cgroup_read() to return u64 instead of printing itself and rename it to mem_cgroup_read_u64(), and update mem_cgroup_oom_control_read() to use ->read_seq_string() instead of ->read_map(). This patch doesn't make any visible behavior changes. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7aa0d40..f149521 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5150,14 +5150,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return val << PAGE_SHIFT; } -static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) +static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - char str[64]; u64 val; - int name, len; + int name; enum res_type type; type = MEMFILE_TYPE(cft->private); @@ -5183,8 +5181,7 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, BUG(); } - len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); - return simple_read_from_buffer(buf, nbytes, ppos, str, len); + return val; } static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) @@ -5911,16 +5908,12 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, } static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct cgroup_map_cb *cb) + struct cftype *cft, struct seq_file *sf) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); - - if (atomic_read(&memcg->under_oom)) - cb->fill(cb, "under_oom", 1); - else - cb->fill(cb, "under_oom", 0); + seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); + seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); return 0; } @@ -6239,31 +6232,31 @@ static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), .write_string = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "soft_limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), .write_string = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "stat", @@ -6297,7 +6290,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "oom_control", - .read_map = mem_cgroup_oom_control_read, + .read_seq_string = mem_cgroup_oom_control_read, .write_u64 = mem_cgroup_oom_control_write, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, @@ -6315,24 +6308,24 @@ static struct cftype mem_cgroup_files[] = { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), .write_string = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), - .read = mem_cgroup_read, + .read_u64 
= mem_cgroup_read_u64, }, { .name = "kmem.failcnt", .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, #ifdef CONFIG_SLABINFO { @@ -6349,25 +6342,25 @@ static struct cftype memsw_cgroup_files[] = { { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.limit_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), .write_string = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.failcnt", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { }, /* terminate */ }; -- cgit v0.10.2 From e92e113cabc1d3e47dc4740a814adb413f022e2f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:02 -0500 Subject: netprio_cgroup: convert away from cftype->read_map() In preparation of conversion to kernfs, cgroup file handling is being consolidated so that it can be easily mapped to the seq_file based interface of kernfs. cftype->read_map() doesn't add any value and being replaced with ->read_seq_string(). Update read_priomap() to use ->read_seq_string() instead. This patch doesn't make any visible behavior changes. Signed-off-by: Tejun Heo Acked-by: Neil Horman Acked-by: Daniel Wagner Acked-by: Li Zefan diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 9b7cf6c..498710d 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -174,13 +174,13 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) } static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, - struct cgroup_map_cb *cb) + struct seq_file *sf) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) - cb->fill(cb, dev->name, netprio_prio(css, dev)); + seq_printf(sf, "%s %u\n", dev->name, netprio_prio(css, dev)); rcu_read_unlock(); return 0; } @@ -238,7 +238,7 @@ static struct cftype ss_files[] = { }, { .name = "ifpriomap", - .read_map = read_priomap, + .read_seq_string = read_priomap, .write_string = write_priomap, }, { } /* terminate */ -- cgit v0.10.2 From 716f479d279fb456f58be44180d7479da75e5a4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:03 -0500 Subject: hugetlb_cgroup: convert away from cftype->read() In preparation of conversion to kernfs, cgroup file handling is being consolidated so that it can be easily mapped to the seq_file based interface of kernfs. All users of cftype->read() can be easily served, usually better, by seq_file and other methods. Update hugetlb_cgroup_read() to return u64 instead of printing itself and rename it to hugetlb_cgroup_read_u64(). This patch doesn't make any visible behavior changes. 
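The boilerplate deleted here is the same in every converted controller: format the u64 into a scratch buffer, then serve it while honoring the read position. A userspace rendering of the simple_read_from_buffer() semantics involved (a stand-in, not the kernel function) shows what each ->read() handler had been open-coding:

/*
 * Userspace rendering of the simple_read_from_buffer() pattern: copy
 * out of a preformatted buffer while honoring the read position.
 * A stand-in for illustration, not the kernel implementation.
 */
#include <stdio.h>
#include <string.h>

static long read_from_buffer(char *to, size_t nbytes, long *ppos,
                             const char *from, size_t available)
{
        size_t pos = (size_t)*ppos;

        if (pos >= available)
                return 0;                       /* EOF */
        if (nbytes > available - pos)
                nbytes = available - pos;
        memcpy(to, from + pos, nbytes);
        *ppos = (long)(pos + nbytes);
        return (long)nbytes;
}

int main(void)
{
        char str[64], out[8];
        unsigned long long val = 4194304;       /* e.g. a usage_in_bytes value */
        int len = snprintf(str, sizeof(str), "%llu\n", val);
        long pos = 0, n;

        /* two short reads, like a reader with a small buffer */
        while ((n = read_from_buffer(out, sizeof(out) - 1, &pos, str, len)) > 0) {
                out[n] = '\0';
                fputs(out, stdout);
        }
        return 0;
}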
Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Acked-by: Li Zefan Cc: Aneesh Kumar K.V Cc: Johannes Weiner diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index bda8e44..d747a84 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -242,22 +242,16 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, return; } -static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - u64 val; - char str[64]; - int idx, name, len; + int idx, name; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); idx = MEMFILE_IDX(cft->private); name = MEMFILE_ATTR(cft->private); - val = res_counter_read_u64(&h_cg->hugepage[idx], name); - len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); - return simple_read_from_buffer(buf, nbytes, ppos, str, len); + return res_counter_read_u64(&h_cg->hugepage[idx], name); } static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, @@ -337,28 +331,28 @@ static void __init __hugetlb_cgroup_file_init(int idx) cft = &h->cgroup_files[0]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); - cft->read = hugetlb_cgroup_read; + cft->read_u64 = hugetlb_cgroup_read_u64; cft->write_string = hugetlb_cgroup_write; /* Add the usage file */ cft = &h->cgroup_files[1]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); - cft->read = hugetlb_cgroup_read; + cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the MAX usage file */ cft = &h->cgroup_files[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); cft->trigger = hugetlb_cgroup_reset; - cft->read = hugetlb_cgroup_read; + cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the failcntfile */ cft = &h->cgroup_files[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); cft->trigger = hugetlb_cgroup_reset; - cft->read = hugetlb_cgroup_read; + cft->read_u64 = hugetlb_cgroup_read_u64; /* NULL terminate the last cft */ cft = &h->cgroup_files[4]; -- cgit v0.10.2 From 6e0755b08dd6a3b5260fafc6969268c2ba261300 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:03 -0500 Subject: cgroup: remove cftype->read(), ->read_map() and ->write() In preparation of conversion to kernfs, cgroup file handling is being consolidated so that it can be easily mapped to the seq_file based interface of kernfs. After recent updates, ->read() and ->read_map() don't have any user left and ->write() never had any user. Remove them. 
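For context, the cftype table is optional-callback dispatch: each file sets only the handlers it supports and core code probes the pointers in a fixed order, so dead pointers are pure overhead. A minimal sketch of the idea with illustrative names, probing the seq handler first as the read-path unification later in this series does:

/*
 * Minimal sketch of optional-callback dispatch in the style of cftype:
 * a file sets only the handlers it supports and the core probes them
 * in order. Illustrative names, not the kernel structures.
 */
#include <stdio.h>

struct ftype {
        const char *name;
        int (*read_seq)(FILE *out);
        unsigned long long (*read_u64)(void);
};

static int stat_read_seq(FILE *out)
{
        fprintf(out, "nr_periods 0\nnr_throttled 0\n");
        return 0;
}

static unsigned long long usage_read_u64(void)
{
        return 4096;
}

/* core-side dispatch: seq handler first, then the scalar shortcut */
static int show(const struct ftype *ft, FILE *out)
{
        if (ft->read_seq)
                return ft->read_seq(out);
        if (ft->read_u64) {
                fprintf(out, "%llu\n", ft->read_u64());
                return 0;
        }
        return -1;      /* no read handler: the file is write-only */
}

int main(void)
{
        const struct ftype files[] = {
                { .name = "stat",           .read_seq = stat_read_seq },
                { .name = "usage_in_bytes", .read_u64 = usage_read_u64 },
        };

        for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
                printf("== %s ==\n", files[i].name);
                show(&files[i], stdout);
        }
        return 0;
}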
Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 50d8cc3..53e11da 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -387,16 +387,6 @@ struct css_set { }; /* - * cgroup_map_cb is an abstract callback API for reporting map-valued - * control files - */ - -struct cgroup_map_cb { - int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); - void *state; -}; - -/* * struct cftype: handler definitions for cgroup control files * * When reading/writing to a file: @@ -444,9 +434,6 @@ struct cftype { struct cgroup_subsys *ss; int (*open)(struct inode *inode, struct file *file); - ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos); /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() @@ -457,24 +444,12 @@ struct cftype { */ s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); /* - * read_map() is used for defining a map of key/value - * pairs. It should call cb->fill(cb, key, value) for each - * entry. The key/value pairs (and their ordering) should not - * change between reboots. - */ - int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft, - struct cgroup_map_cb *cb); - /* * read_seq_string() is used for outputting a simple sequence * using seqfile. */ int (*read_seq_string)(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m); - ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft, - struct file *file, - const char __user *buf, size_t nbytes, loff_t *ppos); - /* * write_u64() is a shortcut for the common case of accepting * a single integer (as parsed by simple_strtoull) from diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f9f5fe3..b132ff9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2324,8 +2324,6 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup_subsys_state *css = cfe->css; - if (cft->write) - return cft->write(css, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); if (cft->write_string) @@ -2366,8 +2364,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup_subsys_state *css = cfe->css; - if (cft->read) - return cft->read(css, cft, file, buf, nbytes, ppos); if (cft->read_u64) return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); if (cft->read_s64) @@ -2380,25 +2376,12 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, * supports string->u64 maps, but can be extended in future. 
*/ -static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) -{ - struct seq_file *sf = cb->state; - return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); -} - static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cfent *cfe = m->private; struct cftype *cft = cfe->type; struct cgroup_subsys_state *css = cfe->css; - if (cft->read_map) { - struct cgroup_map_cb cb = { - .fill = cgroup_map_add, - .state = m, - }; - return cft->read_map(css, cft, &cb); - } return cft->read_seq_string(css, cft, m); } @@ -2444,7 +2427,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) WARN_ON_ONCE(cfe->css && cfe->css != css); cfe->css = css; - if (cft->read_map || cft->read_seq_string) { + if (cft->read_seq_string) { file->f_op = &cgroup_seqfile_operations; err = single_open(file, cgroup_seqfile_show, cfe); } else if (cft->open) { @@ -2658,12 +2641,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->mode) return cft->mode; - if (cft->read || cft->read_u64 || cft->read_s64 || - cft->read_map || cft->read_seq_string) + if (cft->read_u64 || cft->read_s64 || cft->read_seq_string) mode |= S_IRUGO; - if (cft->write || cft->write_u64 || cft->write_s64 || - cft->write_string || cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write_string || + cft->trigger) mode |= S_IWUSR; return mode; -- cgit v0.10.2 From a742c59de66ea080afa3edaf3428b3cdd5aa87cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:03 -0500 Subject: cgroup: unify cgroup_write_X64() and cgroup_write_string() cgroup_write_X64() and cgroup_write_string() both implement about the same buffering logic. Unify the two into cgroup_file_write() which always allocates dynamic buffer for simplicity and uses kstrto*() instead of simple_strto*(). This patch doesn't make any visible behavior changes except for possibly different error value from kstrto*().
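The possible error-value difference comes from kstrto*() being strict where simple_strto*() is lenient: trailing garbage and overflow become hard errors instead of being silently ignored. A userspace approximation of that strictness, modeled on the documented kstrto*() behavior (including tolerating one trailing newline) rather than copied from the kernel:

/*
 * Userspace approximation of kstrtoull()'s strictness: the whole string
 * must be a valid number, overflow is a hard error, and one trailing
 * newline is tolerated (write(2) payloads usually carry one). Modeled
 * on documented kstrto*() behavior, not copied from the kernel.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int strict_strtoull(const char *s, unsigned int base,
                           unsigned long long *res)
{
        unsigned long long v;
        char *end;

        errno = 0;
        v = strtoull(s, &end, base);
        if (errno == ERANGE)
                return -ERANGE;         /* overflow */
        if (end == s)
                return -EINVAL;         /* no digits at all */
        if (*end == '\n')
                end++;                  /* allow one trailing newline */
        if (*end != '\0')
                return -EINVAL;         /* trailing garbage */
        *res = v;
        return 0;
}

int main(void)
{
        unsigned long long v;

        printf("%d\n", strict_strtoull("4096\n", 0, &v));       /* 0 */
        printf("%d\n", strict_strtoull("4096k", 0, &v));        /* -EINVAL */
        printf("%d\n", strict_strtoull("999999999999999999999", 0, &v)); /* -ERANGE */
        return 0;
}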
Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b132ff9..eb34caf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2249,90 +2249,50 @@ static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 -static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - const char __user *userbuf, size_t nbytes, - loff_t *unused_ppos) -{ - char buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - char *end; - - if (!nbytes) - return -EINVAL; - if (nbytes >= sizeof(buffer)) - return -E2BIG; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; - - buffer[nbytes] = 0; /* nul-terminate */ - if (cft->write_u64) { - u64 val = simple_strtoull(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_u64(css, cft, val); - } else { - s64 val = simple_strtoll(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_s64(css, cft, val); - } - if (!retval) - retval = nbytes; - return retval; -} - -static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - const char __user *userbuf, size_t nbytes, - loff_t *unused_ppos) +static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, + size_t nbytes, loff_t *ppos) { - char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - size_t max_bytes = cft->max_write_len; - char *buffer = local_buffer; + struct cfent *cfe = __d_cfe(file->f_dentry); + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cfe->css; + size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1; + char *buf; + int ret; - if (!max_bytes) - max_bytes = sizeof(local_buffer) - 1; if (nbytes >= max_bytes) return -E2BIG; - /* Allocate a dynamic buffer if we need one */ - if (nbytes >= sizeof(local_buffer)) { - buffer = kmalloc(nbytes + 1, GFP_KERNEL); - if (buffer == NULL) - return -ENOMEM; - } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { - retval = -EFAULT; - goto out; - } - buffer[nbytes] = 0; /* nul-terminate */ - retval = cft->write_string(css, cft, strstrip(buffer)); - if (!retval) - retval = nbytes; -out: - if (buffer != local_buffer) - kfree(buffer); - return retval; -} + buf = kmalloc(nbytes + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; -static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; + if (copy_from_user(buf, userbuf, nbytes)) { + ret = -EFAULT; + goto out_free; + } - if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); - if (cft->write_string) - return cgroup_write_string(css, cft, file, buf, nbytes, ppos); - if (cft->trigger) { - int ret = cft->trigger(css, (unsigned int)cft->private); - return ret ? 
ret : nbytes; + buf[nbytes] = '\0'; + + if (cft->write_string) { + ret = cft->write_string(css, cft, strstrip(buf)); + } else if (cft->write_u64) { + unsigned long long v; + ret = kstrtoull(buf, 0, &v); + if (!ret) + ret = cft->write_u64(css, cft, v); + } else if (cft->write_s64) { + long long v; + ret = kstrtoll(buf, 0, &v); + if (!ret) + ret = cft->write_s64(css, cft, v); + } else if (cft->trigger) { + ret = cft->trigger(css, (unsigned int)cft->private); + } else { + ret = -EINVAL; } - return -EINVAL; +out_free: + kfree(buf); + return ret ?: nbytes; } static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, -- cgit v0.10.2 From 896f5199631560202885715da1b2f018632084e5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:04 -0500 Subject: cgroup: unify read path so that seq_file is always used With the recent removal of cftype->read() and ->read_map(), only three operations are remaining, ->read_u64(), ->read_s64() and ->read_seq_string(). Currently, the first two are handled directly while the last is handled through seq_file. It is trivial to serve the first two through the seq_file path too. This patch restructures read path so that all operations are served through cgroup_seqfile_show(). This makes all cgroup files seq_file - single_open/release() are now used by default, cgroup_seqfile_operations is dropped, and cgroup_file_operations uses seq_read() for read. This simplifies the code and makes the read path easy to convert to use kernfs. Note that, while cgroup_file_operations uses seq_read() for read, it still uses generic_file_llseek() for seeking instead of seq_lseek(). This is different from cgroup_seqfile_operations but shouldn't break anything and brings the seeking behavior aligned with kernfs. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index eb34caf..ce6db71 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2295,42 +2295,6 @@ out_free: return ret ?: nbytes; } -static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) -{ - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(css, cft); - int len = sprintf(tmp, "%llu\n", (unsigned long long) val); - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); -} - -static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) -{ - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(css, cft); - int len = sprintf(tmp, "%lld\n", (long long) val); - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); -} - -static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - - if (cft->read_u64) - return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); - if (cft->read_s64) - return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); - return -EINVAL; -} - /* * seqfile ops/methods for returning structured data. Currently just * supports string->u64 maps, but can be extended in future. 
@@ -2342,15 +2306,17 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) struct cftype *cft = cfe->type; struct cgroup_subsys_state *css = cfe->css; - return cft->read_seq_string(css, cft, m); -} + if (cft->read_seq_string) + return cft->read_seq_string(css, cft, m); -static const struct file_operations cgroup_seqfile_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = seq_lseek, - .release = cgroup_file_release, -}; + if (cft->read_u64) + seq_printf(m, "%llu\n", cft->read_u64(css, cft)); + else if (cft->read_s64) + seq_printf(m, "%lld\n", cft->read_s64(css, cft)); + else + return -EINVAL; + return 0; +} static int cgroup_file_open(struct inode *inode, struct file *file) { @@ -2387,12 +2353,10 @@ static int cgroup_file_open(struct inode *inode, struct file *file) WARN_ON_ONCE(cfe->css && cfe->css != css); cfe->css = css; - if (cft->read_seq_string) { - file->f_op = &cgroup_seqfile_operations; - err = single_open(file, cgroup_seqfile_show, cfe); - } else if (cft->open) { + if (cft->open) err = cft->open(inode, file); - } + else + err = single_open(file, cgroup_seqfile_show, cfe); if (css->ss && err) css_put(css); @@ -2406,9 +2370,7 @@ static int cgroup_file_release(struct inode *inode, struct file *file) if (css->ss) css_put(css); - if (file->f_op == &cgroup_seqfile_operations) - single_release(inode, file); - return 0; + return single_release(inode, file); } /* @@ -2519,7 +2481,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) } static const struct file_operations cgroup_file_operations = { - .read = cgroup_file_read, + .read = seq_read, .write = cgroup_file_write, .llseek = generic_file_llseek, .open = cgroup_file_open, -- cgit v0.10.2 From 5d22444f427511fe7c4733b233680309af5b071c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:04 -0500 Subject: cgroup: generalize cgroup_pidlist_open_file In preparation of conversion to kernfs, cgroup file handling is updated so that it can be easily mapped to kernfs. This patch renames cgroup_pidlist_open_file to cgroup_open_file and updates it so that it only contains a field to identify the specific file, ->cfe, and an opaque ->priv pointer. When cgroup is converted to kernfs, this will be replaced by kernfs_open_file which contains about the same information. As whether the file is "cgroup.procs" or "tasks" should now be determined from cgroup_open_file->cfe, the cftype->private for the two files now carry the file type and cgroup_pidlist_start() reads the type through cfe->type->private. This makes the distinction between cgroup_tasks_open() and cgroup_procs_open() unnecessary. cgroup_pidlist_open() is now directly used as the open method. This patch doesn't make any behavior changes. v2: Refreshed on top of the updated "cgroup: introduce struct cgroup_pidlist_open_file". Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ce6db71..17272893 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3369,10 +3369,9 @@ struct cgroup_pidlist { }; /* seq_file->private points to the following */ -struct cgroup_pidlist_open_file { - enum cgroup_filetype type; - struct cgroup *cgrp; - struct cgroup_pidlist *pidlist; +struct cgroup_open_file { + struct cfent *cfe; + void *priv; }; /* @@ -3689,33 +3688,35 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * after a seek to the start). 
Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pidlist_open_file *of = s->private; - struct cgroup *cgrp = of->cgrp; + struct cgroup_open_file *of = s->private; + struct cgroup *cgrp = of->cfe->css->cgroup; struct cgroup_pidlist *l; + enum cgroup_filetype type = of->cfe->type->private; int index = 0, pid = *pos; int *iter, ret; mutex_lock(&cgrp->pidlist_mutex); /* - * !NULL @of->pidlist indicates that this isn't the first start() + * !NULL @of->priv indicates that this isn't the first start() * after open. If the matching pidlist is around, we can use that. - * Look for it. Note that @of->pidlist can't be used directly. It + * Look for it. Note that @of->priv can't be used directly. It * could already have been destroyed. */ - if (of->pidlist) - of->pidlist = cgroup_pidlist_find(cgrp, of->type); + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. */ - if (!of->pidlist) { - ret = pidlist_array_load(of->cgrp, of->type, &of->pidlist); + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); if (ret) return ERR_PTR(ret); } - l = of->pidlist; + l = of->priv; if (pid) { int end = l->length; @@ -3742,19 +3743,19 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_pidlist_open_file *of = s->private; + struct cgroup_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; - if (of->pidlist) - mod_delayed_work(cgroup_pidlist_destroy_wq, - &of->pidlist->destroy_dwork, + if (l) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, CGROUP_PIDLIST_DESTROY_DELAY); - mutex_unlock(&of->cgrp->pidlist_mutex); + mutex_unlock(&of->cfe->css->cgroup->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pidlist_open_file *of = s->private; - struct cgroup_pidlist *l = of->pidlist; + struct cgroup_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; pid_t *p = v; pid_t *end = l->list + l->length; /* @@ -3765,7 +3766,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) if (p >= end) { return NULL; } else { - *pos = cgroup_pid_fry(of->cgrp, *p); + *pos = cgroup_pid_fry(of->cfe->css->cgroup, *p); return p; } } @@ -3799,10 +3800,10 @@ static const struct file_operations cgroup_pidlist_operations = { * in the cgroup. 
*/ /* helper function for the two below it */ -static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) +static int cgroup_pidlist_open(struct inode *unused, struct file *file) { - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct cgroup_pidlist_open_file *of; + struct cfent *cfe = __d_cfe(file->f_dentry); + struct cgroup_open_file *of; /* configure file information */ file->f_op = &cgroup_pidlist_operations; @@ -3812,18 +3813,9 @@ static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) if (!of) return -ENOMEM; - of->type = type; - of->cgrp = cgrp; + of->cfe = cfe; return 0; } -static int cgroup_tasks_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); -} -static int cgroup_procs_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); -} static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) @@ -3878,7 +3870,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", - .open = cgroup_procs_open, + .open = cgroup_pidlist_open, + .private = CGROUP_FILE_PROCS, .write_u64 = cgroup_procs_write, .mode = S_IRUGO | S_IWUSR, }, @@ -3902,7 +3895,8 @@ static struct cftype cgroup_base_files[] = { { .name = "tasks", .flags = CFTYPE_INSANE, /* use "procs" instead */ - .open = cgroup_tasks_open, + .open = cgroup_pidlist_open, + .private = CGROUP_FILE_TASKS, .write_u64 = cgroup_tasks_write, .mode = S_IRUGO | S_IWUSR, }, -- cgit v0.10.2 From 7da112792753d71aed44b918395892a1fc53048a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:04 -0500 Subject: cgroup: attach cgroup_open_file to all cgroup files In preparation of conversion to kernfs, cgroup file handling is updated so that it can be easily mapped to kernfs. This patch attaches cgroup_open_file, which used to be attached to pidlist files, to all cgroup files, introduces seq_css/cft() accessors to determine the cgroup_subsys_state and cftype associated with a given cgroup seq_file, exports them as public interface. This doesn't cause any behavior changes but unifies cgroup file handling across different file types and will help converting them to kernfs seq_show() interface. v2: Li pointed out that the original patch was using single_open_size() incorrectly assuming that the size param is private data size. Fix it by allocating @of separately and passing it to single_open() and explicitly freeing it in the release path. This isn't the prettiest but this path is gonna be restructured by the following patches pretty soon. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 53e11da..c3d698a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -21,6 +21,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -490,6 +491,26 @@ struct cftype_set { }; /* + * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't + * access directly. + */ +struct cfent { + struct list_head node; + struct dentry *dentry; + struct cftype *type; + struct cgroup_subsys_state *css; + + /* file xattrs */ + struct simple_xattrs xattrs; +}; + +/* seq_file->private points to the following, only ->priv is public */ +struct cgroup_open_file { + struct cfent *cfe; + void *priv; +}; + +/* * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. 
This * function can be called as long as @cgrp is accessible. */ @@ -504,6 +525,18 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) return rcu_dereference(cgrp->name)->name; } +static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + struct cgroup_open_file *of = seq->private; + return of->cfe->css; +} + +static inline struct cftype *seq_cft(struct seq_file *seq) +{ + struct cgroup_open_file *of = seq->private; + return of->cfe->type; +} + int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 17272893..036c05d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -130,19 +129,6 @@ static struct cgroupfs_root cgroup_dummy_root; /* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; -/* - * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. - */ -struct cfent { - struct list_head node; - struct dentry *dentry; - struct cftype *type; - struct cgroup_subsys_state *css; - - /* file xattrs */ - struct simple_xattrs xattrs; -}; - /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -2302,9 +2288,8 @@ out_free: static int cgroup_seqfile_show(struct seq_file *m, void *arg) { - struct cfent *cfe = m->private; - struct cftype *cft = cfe->type; - struct cgroup_subsys_state *css = cfe->css; + struct cftype *cft = seq_cft(m); + struct cgroup_subsys_state *css = seq_css(m); if (cft->read_seq_string) return cft->read_seq_string(css, cft, m); @@ -2353,10 +2338,20 @@ static int cgroup_file_open(struct inode *inode, struct file *file) WARN_ON_ONCE(cfe->css && cfe->css != css); cfe->css = css; - if (cft->open) + if (cft->open) { err = cft->open(inode, file); - else - err = single_open(file, cgroup_seqfile_show, cfe); + } else { + struct cgroup_open_file *of; + + err = -ENOMEM; + of = kzalloc(sizeof(*of), GFP_KERNEL); + if (of) { + of->cfe = cfe; + err = single_open(file, cgroup_seqfile_show, of); + if (err) + kfree(of); + } + } if (css->ss && err) css_put(css); @@ -2370,6 +2365,7 @@ static int cgroup_file_release(struct inode *inode, struct file *file) if (css->ss) css_put(css); + kfree(((struct seq_file *)file->private_data)->private); return single_release(inode, file); } @@ -3368,12 +3364,6 @@ struct cgroup_pidlist { struct delayed_work destroy_dwork; }; -/* seq_file->private points to the following */ -struct cgroup_open_file { - struct cfent *cfe; - void *priv; -}; - /* * The following two functions "fix" the issue where there are more pids * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 
@@ -3689,9 +3679,9 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * next pid to display, if any */ struct cgroup_open_file *of = s->private; - struct cgroup *cgrp = of->cfe->css->cgroup; + struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; - enum cgroup_filetype type = of->cfe->type->private; + enum cgroup_filetype type = seq_cft(s)->private; int index = 0, pid = *pos; int *iter, ret; @@ -3749,7 +3739,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, CGROUP_PIDLIST_DESTROY_DELAY); - mutex_unlock(&of->cfe->css->cgroup->pidlist_mutex); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) @@ -3766,7 +3756,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) if (p >= end) { return NULL; } else { - *pos = cgroup_pid_fry(of->cfe->css->cgroup, *p); + *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); return p; } } -- cgit v0.10.2 From 2da8ca822d49c8b8781800ad155aaa00e7bb5f1a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:04 -0500 Subject: cgroup: replace cftype->read_seq_string() with cftype->seq_show() In preparation of conversion to kernfs, cgroup file handling is updated so that it can be easily mapped to kernfs. This patch replaces cftype->read_seq_string() with cftype->seq_show() which is not limited to single_open() operation and will map directly to the kernfs seq_file interface. The conversions are mechanical. As ->seq_show() doesn't have @css and @cft, the functions which make use of them are converted to use seq_css() and seq_cft() respectively. In several places, e.g. if it has seq_string in its name, the function name is updated to fit the new method better. This patch does not introduce any behavior changes.
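The conversion pattern is the same everywhere and is easiest to see side by side; the sketch below uses a hypothetical "foo" controller (css_foo() and the ->stat member are invented for illustration):

/* before: the css and cftype arrive as explicit arguments */
static int foo_stat_show_old(struct cgroup_subsys_state *css,
			     struct cftype *cft, struct seq_file *sf)
{
	seq_printf(sf, "%llu\n", css_foo(css)->stat);
	return 0;
}

/* after: only the seq_file is passed; the css and cftype are
 * recovered through the seq_css()/seq_cft() accessors */
static int foo_stat_show(struct seq_file *sf, void *v)
{
	seq_printf(sf, "%llu\n", css_foo(seq_css(sf))->stat);
	return 0;
}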
Signed-off-by: Tejun Heo Acked-by: Aristeu Rozanski Acked-by: Vivek Goyal Acked-by: Michal Hocko Acked-by: Daniel Wagner Acked-by: Li Zefan Cc: Jens Axboe Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Neil Horman diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0653404..a760857 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1303,13 +1303,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &rwstat); } -static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, - cft->private, true); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, + &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } @@ -1335,19 +1332,17 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, return __blkg_prfill_u64(sf, pd, v); } -static int tg_print_conf_u64(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int tg_print_conf_u64(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64, - &blkcg_policy_throtl, cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, + &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } -static int tg_print_conf_uint(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int tg_print_conf_uint(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint, - &blkcg_policy_throtl, cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, + &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } @@ -1428,40 +1423,40 @@ static struct cftype throtl_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), - .read_seq_string = tg_print_conf_u64, + .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, .max_write_len = 256, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), - .read_seq_string = tg_print_conf_u64, + .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, .max_write_len = 256, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), - .read_seq_string = tg_print_conf_uint, + .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, .max_write_len = 256, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), - .read_seq_string = tg_print_conf_uint, + .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, .max_write_len = 256, }, { .name = "throttle.io_service_bytes", .private = offsetof(struct tg_stats_cpu, service_bytes), - .read_seq_string = tg_print_cpu_rwstat, + .seq_show = tg_print_cpu_rwstat, }, { .name = "throttle.io_serviced", .private = offsetof(struct tg_stats_cpu, serviced), - .read_seq_string = tg_print_cpu_rwstat, + .seq_show = tg_print_cpu_rwstat, }, { } /* terminate */ }; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4d5cec1..744833b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1632,11 +1632,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); 
} -static int cfqg_print_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_weight_device(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device, - &blkcg_policy_cfq, 0, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_weight_device, &blkcg_policy_cfq, + 0, false); return 0; } @@ -1650,26 +1650,23 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); } -static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device, - &blkcg_policy_cfq, 0, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, + 0, false); return 0; } -static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *sf) +static int cfq_print_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight); + seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight); return 0; } -static int cfq_print_leaf_weight(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfq_print_leaf_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight); + seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight); return 0; } @@ -1762,23 +1759,17 @@ static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, return __cfq_set_weight(css, cft, val, true); } -static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_stat(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, - cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_cfq, seq_cft(sf)->private, false); return 0; } -static int cfqg_print_rwstat(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_rwstat(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, - cft->private, true); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, + &blkcg_policy_cfq, seq_cft(sf)->private, true); return 0; } @@ -1798,23 +1789,19 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &sum); } -static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_stat_recursive(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, - &blkcg_policy_cfq, cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_stat_recursive, &blkcg_policy_cfq, + seq_cft(sf)->private, false); return 0; } -static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, - &blkcg_policy_cfq, cft->private, true); 
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, + seq_cft(sf)->private, true); return 0; } @@ -1835,13 +1822,11 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, } /* print avg_queue_size */ -static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, - &blkcg_policy_cfq, 0, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, + 0, false); return 0; } #endif /* CONFIG_DEBUG_BLK_CGROUP */ @@ -1851,14 +1836,14 @@ static struct cftype cfq_blkcg_files[] = { { .name = "weight_device", .flags = CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cfqg_print_leaf_weight_device, + .seq_show = cfqg_print_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device, .max_write_len = 256, }, { .name = "weight", .flags = CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cfq_print_leaf_weight, + .seq_show = cfq_print_leaf_weight, .write_u64 = cfq_set_leaf_weight, }, @@ -1866,26 +1851,26 @@ static struct cftype cfq_blkcg_files[] = { { .name = "weight_device", .flags = CFTYPE_NOT_ON_ROOT, - .read_seq_string = cfqg_print_weight_device, + .seq_show = cfqg_print_weight_device, .write_string = cfqg_set_weight_device, .max_write_len = 256, }, { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, - .read_seq_string = cfq_print_weight, + .seq_show = cfq_print_weight, .write_u64 = cfq_set_weight, }, { .name = "leaf_weight_device", - .read_seq_string = cfqg_print_leaf_weight_device, + .seq_show = cfqg_print_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device, .max_write_len = 256, }, { .name = "leaf_weight", - .read_seq_string = cfq_print_leaf_weight, + .seq_show = cfq_print_leaf_weight, .write_u64 = cfq_set_leaf_weight, }, @@ -1893,114 +1878,114 @@ static struct cftype cfq_blkcg_files[] = { { .name = "time", .private = offsetof(struct cfq_group, stats.time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "sectors", .private = offsetof(struct cfq_group, stats.sectors), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "io_service_bytes", .private = offsetof(struct cfq_group, stats.service_bytes), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_serviced", .private = offsetof(struct cfq_group, stats.serviced), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_service_time", .private = offsetof(struct cfq_group, stats.service_time), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_wait_time", .private = offsetof(struct cfq_group, stats.wait_time), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_merged", .private = offsetof(struct cfq_group, stats.merged), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_queued", .private = offsetof(struct cfq_group, stats.queued), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, /* the same statictics which cover the cfqg and its descendants */ { .name = "time_recursive", .private = offsetof(struct cfq_group, stats.time), - .read_seq_string = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_recursive, }, { .name = "sectors_recursive", .private = 
offsetof(struct cfq_group, stats.sectors), - .read_seq_string = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_recursive, }, { .name = "io_service_bytes_recursive", .private = offsetof(struct cfq_group, stats.service_bytes), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_serviced_recursive", .private = offsetof(struct cfq_group, stats.serviced), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_service_time_recursive", .private = offsetof(struct cfq_group, stats.service_time), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_wait_time_recursive", .private = offsetof(struct cfq_group, stats.wait_time), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_merged_recursive", .private = offsetof(struct cfq_group, stats.merged), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_queued_recursive", .private = offsetof(struct cfq_group, stats.queued), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "avg_queue_size", - .read_seq_string = cfqg_print_avg_queue_size, + .seq_show = cfqg_print_avg_queue_size, }, { .name = "group_wait_time", .private = offsetof(struct cfq_group, stats.group_wait_time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "idle_time", .private = offsetof(struct cfq_group, stats.idle_time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "empty_time", .private = offsetof(struct cfq_group, stats.empty_time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "dequeue", .private = offsetof(struct cfq_group, stats.dequeue), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "unaccounted_time", .private = offsetof(struct cfq_group, stats.unaccounted_time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, #endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c3d698a..b32a0f8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -444,12 +444,9 @@ struct cftype { * read_s64() is a signed version of read_u64() */ s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); - /* - * read_seq_string() is used for outputting a simple sequence - * using seqfile. 
- */ - int (*read_seq_string)(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m); + + /* generic seq_file read interface */ + int (*seq_show)(struct seq_file *sf, void *v); /* * write_u64() is a shortcut for the common case of accepting diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 036c05d..c45e633 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2212,10 +2212,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css, return 0; } -static int cgroup_release_agent_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_release_agent_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = css->cgroup; + struct cgroup *cgrp = seq_css(seq)->cgroup; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; @@ -2225,10 +2224,11 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css, return 0; } -static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { - seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); return 0; } @@ -2291,8 +2291,8 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) struct cftype *cft = seq_cft(m); struct cgroup_subsys_state *css = seq_css(m); - if (cft->read_seq_string) - return cft->read_seq_string(css, cft, m); + if (cft->seq_show) + return cft->seq_show(m, arg); if (cft->read_u64) seq_printf(m, "%llu\n", cft->read_u64(css, cft)); @@ -2559,7 +2559,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->mode) return cft->mode; - if (cft->read_u64 || cft->read_s64 || cft->read_seq_string) + if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write_string || @@ -3874,7 +3874,7 @@ static struct cftype cgroup_base_files[] = { { .name = "cgroup.sane_behavior", .flags = CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cgroup_sane_behavior_show, + .seq_show = cgroup_sane_behavior_show, }, /* @@ -3899,7 +3899,7 @@ static struct cftype cgroup_base_files[] = { { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cgroup_release_agent_show, + .seq_show = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, }, @@ -5274,9 +5274,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, return count; } -static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, - struct cftype *cft, - struct seq_file *seq) +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) { struct cgrp_cset_link *link; struct css_set *cset; @@ -5301,9 +5299,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_css_links_read(struct seq_file *seq, void *v) { + struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; read_lock(&css_set_lock); @@ -5349,12 +5347,12 @@ static struct cftype debug_files[] = { { .name = "current_css_set_cg_links", - .read_seq_string = current_css_set_cg_links_read, + .seq_show = current_css_set_cg_links_read, }, { .name = "cgroup_css_links", - .read_seq_string = cgroup_css_links_read, + .seq_show = 
cgroup_css_links_read, }, { diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f0ff64d..6c3154e 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -301,10 +301,9 @@ out_unlock: spin_unlock_irq(&freezer->lock); } -static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *m) +static int freezer_read(struct seq_file *m, void *v) { - struct cgroup_subsys_state *pos; + struct cgroup_subsys_state *css = seq_css(m), *pos; rcu_read_lock(); @@ -458,7 +457,7 @@ static struct cftype files[] = { { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, - .read_seq_string = freezer_read, + .seq_show = freezer_read, .write_string = freezer_write, }, { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 032929f..4410ac6 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1732,12 +1732,10 @@ out_unlock: * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. */ -static int cpuset_common_read_seq_string(struct cgroup_subsys_state *css, - struct cftype *cft, - struct seq_file *sf) +static int cpuset_common_seq_show(struct seq_file *sf, void *v) { - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; + struct cpuset *cs = css_cs(seq_css(sf)); + cpuset_filetype_t type = seq_cft(sf)->private; ssize_t count; char *buf, *s; int ret = 0; @@ -1824,7 +1822,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) static struct cftype files[] = { { .name = "cpus", - .read_seq_string = cpuset_common_read_seq_string, + .seq_show = cpuset_common_seq_show, .write_string = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, @@ -1832,7 +1830,7 @@ static struct cftype files[] = { { .name = "mems", - .read_seq_string = cpuset_common_read_seq_string, + .seq_show = cpuset_common_seq_show, .write_string = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f28ec67..7e8cbb9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7256,10 +7256,9 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *sf) +static int cpu_stats_show(struct seq_file *sf, void *v) { - struct task_group *tg = css_tg(css); + struct task_group *tg = css_tg(seq_css(sf)); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); @@ -7318,7 +7317,7 @@ static struct cftype cpu_files[] = { }, { .name = "stat", - .read_seq_string = cpu_stats_show, + .seq_show = cpu_stats_show, }, #endif #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dd88738..622e081 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -163,10 +163,9 @@ out: return err; } -static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) { - struct cpuacct *ca = css_ca(css); + struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; int i; @@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -static int cpuacct_stats_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cpuacct_stats_show(struct seq_file *sf, void *v) { - struct 
cpuacct *ca = css_ca(css); + struct cpuacct *ca = css_ca(seq_css(sf)); int cpu; s64 val = 0; @@ -220,11 +218,11 @@ static struct cftype files[] = { }, { .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, + .seq_show = cpuacct_percpu_seq_show, }, { .name = "stat", - .read_seq_string = cpuacct_stats_show, + .seq_show = cpuacct_stats_show, }, { } /* terminate */ }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f149521..9252219 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3014,10 +3014,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) } #ifdef CONFIG_SLABINFO -static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); struct memcg_cache_params *params; if (!memcg_can_account_kmem(memcg)) @@ -5418,8 +5417,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #endif #ifdef CONFIG_NUMA -static int memcg_numa_stat_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int memcg_numa_stat_show(struct seq_file *m, void *v) { struct numa_stat { const char *name; @@ -5435,7 +5433,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css, const struct numa_stat *stat; int nid; unsigned long nr; - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); @@ -5474,10 +5472,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); } -static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *m) +static int memcg_stat_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); struct mem_cgroup *mi; unsigned int i; @@ -5907,10 +5904,9 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, spin_unlock(&memcg_oom_lock); } -static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); @@ -6260,7 +6256,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "stat", - .read_seq_string = memcg_stat_show, + .seq_show = memcg_stat_show, }, { .name = "force_empty", @@ -6290,7 +6286,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "oom_control", - .read_seq_string = mem_cgroup_oom_control_read, + .seq_show = mem_cgroup_oom_control_read, .write_u64 = mem_cgroup_oom_control_write, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, @@ -6300,7 +6296,7 @@ static struct cftype mem_cgroup_files[] = { #ifdef CONFIG_NUMA { .name = "numa_stat", - .read_seq_string = memcg_numa_stat_show, + .seq_show = memcg_numa_stat_show, }, #endif #ifdef CONFIG_MEMCG_KMEM @@ -6330,7 +6326,7 @@ static struct cftype mem_cgroup_files[] = { #ifdef CONFIG_SLABINFO { .name = "kmem.slabinfo", - 
.read_seq_string = mem_cgroup_slabinfo_read, + .seq_show = mem_cgroup_slabinfo_read, }, #endif #endif diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 498710d..56cbb69 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -173,14 +173,14 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) return css->cgroup->id; } -static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *sf) +static int read_priomap(struct seq_file *sf, void *v) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) - seq_printf(sf, "%s %u\n", dev->name, netprio_prio(css, dev)); + seq_printf(sf, "%s %u\n", dev->name, + netprio_prio(seq_css(sf), dev)); rcu_read_unlock(); return 0; } @@ -238,7 +238,7 @@ static struct cftype ss_files[] = { }, { .name = "ifpriomap", - .read_seq_string = read_priomap, + .seq_show = read_priomap, .write_string = write_priomap, }, { } /* terminate */ diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 7c2a0a7..d3b6d2c 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -274,10 +274,9 @@ static void set_majmin(char *str, unsigned m) sprintf(str, "%u", m); } -static int devcgroup_seq_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int devcgroup_seq_show(struct seq_file *m, void *v) { - struct dev_cgroup *devcgroup = css_to_devcgroup(css); + struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m)); struct dev_exception_item *ex; char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; @@ -679,7 +678,7 @@ static struct cftype dev_cgroup_files[] = { }, { .name = "list", - .read_seq_string = devcgroup_seq_read, + .seq_show = devcgroup_seq_show, .private = DEVCG_LIST, }, { } /* terminate */ -- cgit v0.10.2 From 6612f05b88fa309c91a345690411217959bb2486 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:04 -0500 Subject: cgroup: unify pidlist and other file handling In preparation of conversion to kernfs, cgroup file handling is updated so that it can be easily mapped to kernfs. With the previous changes, the difference between pidlist and other files is very small. Both are served by seq_file in a pretty standard way with the only difference being that !pidlist files use single_open(). This patch adds cftype->seq_start(), ->seq_next() and ->seq_stop() and implements the matching cgroup_seqfile_start/next/stop() which either emulate single_open() behavior or invoke cftype->seq_*() operations if specified. This allows using a single seq_operations for both pidlist and other files and makes cgroup_pidlist_operations and cgroup_pidlist_open() no longer necessary. As cgroup_pidlist_open() was the only user of cftype->open(), the method is dropped as well. This brings the cftype file interface very close to the kernfs interface and mapping shouldn't be too difficult. Once converted to kernfs, most of the plumbing code including cgroup_seqfile_*() will be removed as kernfs provides those facilities. This patch does not introduce any behavior changes. v2: Refreshed on top of the updated "cgroup: introduce struct cgroup_pidlist_open_file". v3: Refreshed on top of the updated "cgroup: attach cgroup_open_file to all cgroup files".
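As a usage sketch (all "foo" names below are hypothetical), a single-record file keeps relying on the emulated single_open() path and fills in only ->seq_show(), while a multi-record file such as the pidlists supplies the full iterator triplet:

static struct cftype foo_files[] = {
	{
		.name = "foo.value",
		.seq_show = foo_value_show,	/* one record, no iterator */
	},
	{
		.name = "foo.items",
		.seq_start = foo_items_start,	/* implement all three */
		.seq_next = foo_items_next,	/* or none of them */
		.seq_stop = foo_items_stop,
		.seq_show = foo_items_show,
	},
	{ }	/* terminate */
};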
Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b32a0f8..8b9a594 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -434,7 +434,6 @@ struct cftype { */ struct cgroup_subsys *ss; - int (*open)(struct inode *inode, struct file *file); /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() @@ -448,6 +447,11 @@ struct cftype { /* generic seq_file read interface */ int (*seq_show)(struct seq_file *sf, void *v); + /* optional ops, implement all or none */ + void *(*seq_start)(struct seq_file *sf, loff_t *ppos); + void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); + void (*seq_stop)(struct seq_file *sf, void *v); + /* * write_u64() is a shortcut for the common case of accepting * a single integer (as parsed by simple_strtoull) from diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c45e633..f9ae38a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2286,6 +2286,45 @@ out_free: * supports string->u64 maps, but can be extended in future. */ +static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) +{ + struct cftype *cft = seq_cft(seq); + + if (cft->seq_start) { + return cft->seq_start(seq, ppos); + } else { + /* + * The same behavior and code as single_open(). Returns + * !NULL if pos is at the beginning; otherwise, NULL. + */ + return NULL + !*ppos; + } +} + +static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) +{ + struct cftype *cft = seq_cft(seq); + + if (cft->seq_next) { + return cft->seq_next(seq, v, ppos); + } else { + /* + * The same behavior and code as single_open(), always + * terminate after the initial read. + */ + ++*ppos; + return NULL; + } +} + +static void cgroup_seqfile_stop(struct seq_file *seq, void *v) +{ + struct cftype *cft = seq_cft(seq); + + if (cft->seq_stop) + cft->seq_stop(seq, v); +} + static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cftype *cft = seq_cft(m); @@ -2303,12 +2342,20 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) return 0; } +static struct seq_operations cgroup_seq_operations = { + .start = cgroup_seqfile_start, + .next = cgroup_seqfile_next, + .stop = cgroup_seqfile_stop, + .show = cgroup_seqfile_show, +}; + static int cgroup_file_open(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); struct cgroup_subsys_state *css; + struct cgroup_open_file *of; int err; err = generic_file_open(inode, file); @@ -2338,24 +2385,16 @@ static int cgroup_file_open(struct inode *inode, struct file *file) WARN_ON_ONCE(cfe->css && cfe->css != css); cfe->css = css; - if (cft->open) { - err = cft->open(inode, file); - } else { - struct cgroup_open_file *of; - - err = -ENOMEM; - of = kzalloc(sizeof(*of), GFP_KERNEL); - if (of) { - of->cfe = cfe; - err = single_open(file, cgroup_seqfile_show, of); - if (err) - kfree(of); - } + of = __seq_open_private(file, &cgroup_seq_operations, + sizeof(struct cgroup_open_file)); + if (of) { + of->cfe = cfe; + return 0; } - if (css->ss && err) + if (css->ss) css_put(css); - return err; + return -ENOMEM; } static int cgroup_file_release(struct inode *inode, struct file *file) @@ -2365,8 +2404,7 @@ static int cgroup_file_release(struct inode *inode, struct file *file) if (css->ss) css_put(css); - kfree(((struct seq_file *)file->private_data)->private); - return single_release(inode, 
file); + return seq_release_private(inode, file); } /* @@ -3777,36 +3815,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = { .show = cgroup_pidlist_show, }; -static const struct file_operations cgroup_pidlist_operations = { - .read = seq_read, - .llseek = seq_lseek, - .write = cgroup_file_write, - .release = seq_release_private, -}; - -/* - * The following functions handle opens on a file that displays a pidlist - * (tasks or procs). Prepare an array of the process/thread IDs of whoever's - * in the cgroup. - */ -/* helper function for the two below it */ -static int cgroup_pidlist_open(struct inode *unused, struct file *file) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cgroup_open_file *of; - - /* configure file information */ - file->f_op = &cgroup_pidlist_operations; - - of = __seq_open_private(file, &cgroup_pidlist_seq_operations, - sizeof(*of)); - if (!of) - return -ENOMEM; - - of->cfe = cfe; - return 0; -} - static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -3860,7 +3868,10 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", - .open = cgroup_pidlist_open, + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write_u64 = cgroup_procs_write, .mode = S_IRUGO | S_IWUSR, @@ -3885,7 +3896,10 @@ static struct cftype cgroup_base_files[] = { { .name = "tasks", .flags = CFTYPE_INSANE, /* use "procs" instead */ - .open = cgroup_pidlist_open, + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, .write_u64 = cgroup_tasks_write, .mode = S_IRUGO | S_IWUSR, -- cgit v0.10.2 From 87fb54f1b5a447662854f176eeb1ba92d5ffc1d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Dec 2013 15:11:55 -0500 Subject: cgroup: css iterations and css_from_dir() are safe under cgroup_mutex Currently, all css iterations and css_from_dir() require RCU read lock whether the caller is holding cgroup_mutex or not, which is unnecessarily restrictive. They are all safe to use under cgroup_mutex without holding RCU read lock. Factor out cgroup_assert_mutex_or_rcu_locked() from css_from_id() and apply it to all css iteration functions and css_from_dir(). v2: cgroup_assert_mutex_or_rcu_locked() definition doesn't need to be inside CONFIG_PROVE_RCU ifdef as rcu_lockdep_assert() is always defined and conditionalized. Move it outside of the ifdef block. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2e5fbf9..c22eecb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -93,6 +93,11 @@ static DEFINE_MUTEX(cgroup_mutex); static DEFINE_MUTEX(cgroup_root_mutex); +#define cgroup_assert_mutex_or_rcu_locked() \ + rcu_lockdep_assert(rcu_read_lock_held() || \ + lockdep_is_held(&cgroup_mutex), \ + "cgroup_mutex or RCU read lock required"); + /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup @@ -2897,9 +2902,9 @@ static void cgroup_enable_task_cg_lists(void) * @parent_css: css whose children to walk * * This function returns the next child of @parent_css and should be called - * under RCU read lock. The only requirement is that @parent_css and - * @pos_css are accessible. 
The next sibling is guaranteed to be returned - * regardless of their states. + * under either cgroup_mutex or RCU read lock. The only requirement is + * that @parent_css and @pos_css are accessible. The next sibling is + * guaranteed to be returned regardless of their states. */ struct cgroup_subsys_state * css_next_child(struct cgroup_subsys_state *pos_css, @@ -2909,7 +2914,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been removed. Once a cgroup is removed, @@ -2956,10 +2961,10 @@ EXPORT_SYMBOL_GPL(css_next_child); * to visit for pre-order traversal of @root's descendants. @root is * included in the iteration and the first node to be visited. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct next descendant as long as both @pos - * and @root are accessible and @pos is a descendant of @root. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct next descendant as long + * as both @pos and @root are accessible and @pos is a descendant of @root. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, @@ -2967,7 +2972,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -2998,17 +3003,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre); * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct rightmost descendant as long as @pos is - * accessible. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct rightmost descendant as + * long as @pos is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); do { last = pos; @@ -3044,10 +3049,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * to visit for post-order traversal of @root's descendants. @root is * included in the iteration and the last node to be visited. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct next descendant as long as both @pos - * and @cgroup are accessible and @pos is a descendant of @cgroup. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct next descendant as long + * as both @pos and @cgroup are accessible and @pos is a descendant of + * @cgroup. 
*/ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, @@ -3055,7 +3061,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -5217,16 +5223,16 @@ __setup("cgroup_disable=", cgroup_disable); * @dentry: directory dentry of interest * @ss: subsystem of interest * - * Must be called under RCU read lock. The caller is responsible for - * pinning the returned css if it needs to be accessed outside the RCU - * critical section. + * Must be called under cgroup_mutex or RCU read lock. The caller is + * responsible for pinning the returned css if it needs to be accessed + * outside the critical section. */ struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct cgroup *cgrp; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* is @dentry a cgroup dir? */ if (!dentry->d_inode || @@ -5249,9 +5255,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { struct cgroup *cgrp; - rcu_lockdep_assert(rcu_read_lock_held() || - lockdep_is_held(&cgroup_mutex), - "css_from_id() needs proper protection"); + cgroup_assert_mutex_or_rcu_locked(); cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) -- cgit v0.10.2 From 780cd8b347b52584704438e599274f25c0a0fd13 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Dec 2013 15:11:56 -0500 Subject: cgroup: make for_each_subsys() usable under cgroup_root_mutex We want to use for_each_subsys() in cgroupfs_root handling where only cgroup_root_mutex is held. The only way cgroup_subsys[] can change is through module load/unload; make cgroup_[un]load_subsys() grab cgroup_root_mutex too and update the lockdep annotation in for_each_subsys() to allow either cgroup_mutex or cgroup_root_mutex. * Lockdep annotation is moved from the inner 'if' condition to the outer 'for' init clause. There's no reason to execute the assertion every loop. * Loop index @i is renamed to @ssid. Indices iterating through subsys will be [re]named to @ssid gradually. v2: cgroup_assert_mutex_or_root_locked() caused a build failure if !CONFIG_LOCKDEP. Conditionalize its definition. The build failure was reported by the kbuild test robot. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: kbuild test robot diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c22eecb..4a7fb40 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -98,6 +98,14 @@ static DEFINE_MUTEX(cgroup_root_mutex); lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); +#ifdef CONFIG_LOCKDEP +#define cgroup_assert_mutex_or_root_locked() \ + WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \ + !lockdep_is_held(&cgroup_root_mutex))) +#else +#define cgroup_assert_mutex_or_root_locked() do { } while (0) +#endif + /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup @@ -237,14 +245,15 @@ static int notify_on_release(const struct cgroup *cgrp) /** * for_each_subsys - iterate all loaded cgroup subsystems * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end * - * Should be called under cgroup_mutex. + * Iterates through all loaded subsystems.
Should be called under + * cgroup_mutex or cgroup_root_mutex. */ -#define for_each_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - !((ss) = cgroup_subsys[i]); })) { } \ +#define for_each_subsys(ss, ssid) \ + for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ + (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((ss) = cgroup_subsys[(ssid)])) { } \ else /** @@ -4592,6 +4601,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) cgroup_init_cftsets(ss); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); cgroup_subsys[ss->subsys_id] = ss; /* @@ -4641,10 +4651,12 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) goto err_unload; /* success! */ + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; err_unload: + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); /* @ss can't be mounted here as try_module_get() would fail */ cgroup_unload_subsys(ss); @@ -4674,6 +4686,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) BUG_ON(ss->root != &cgroup_dummy_root); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); offline_css(cgroup_css(cgroup_dummy_top, ss)); @@ -4708,6 +4721,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) ss->css_free(cgroup_css(cgroup_dummy_top, ss)); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); } EXPORT_SYMBOL_GPL(cgroup_unload_subsys); -- cgit v0.10.2 From 0d80255e42b54419cfc6b10a3ec74b60fe04b8d7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Dec 2013 15:11:56 -0500 Subject: cgroup: reorder operations in cgroup_create() cgroup_create() currently does the following. 1. alloc cgroup 2. alloc css's 3. create the directory and commit to cgroup creation 4. online css's 5. create cgroup and css files The sequence performs allocations before other operations but it doesn't buy anything because each of the above steps may fail and should be unrollable. Reorganize the sequence such that cgroup operations are done before css operations. 1. alloc cgroup 2. create the directory and files and commit to cgroup creation 3. alloc css's 4. create files for and online css's This simplifies the code a bit and enables further simplification and separating out css creation from cgroup creation, which is necessary for the planned unified hierarchy where css's will be created and destroyed dynamically across the lifetime of a cgroup. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4a7fb40..30a2670 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4144,23 +4144,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css; - - css = ss->css_alloc(cgroup_css(parent, ss)); - if (IS_ERR(css)) { - err = PTR_ERR(css); - goto err_free_all; - } - css_ar[ss->subsys_id] = css; - - err = percpu_ref_init(&css->refcnt, css_release); - if (err) - goto err_free_all; - - init_css(css, ss, cgrp); - } - /* * Create directory.
cgroup_create_file() returns with the new * directory locked on success so that it can be populated without @@ -4168,7 +4151,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_free_all; + goto err_unlock; lockdep_assert_held(&dentry->d_inode->i_mutex); cgrp->serial_nr = cgroup_serial_nr_next++; @@ -4180,10 +4163,41 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* hold a ref to the parent's dentry */ dget(parent->dentry); + /* + * @cgrp is now fully operational. If something fails after this + * point, it'll be released via the normal destruction path. + */ + idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + + err = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (err) + goto err_destroy; + + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css; + + css = ss->css_alloc(cgroup_css(parent, ss)); + if (IS_ERR(css)) { + err = PTR_ERR(css); + goto err_destroy; + } + css_ar[ss->subsys_id] = css; + + err = percpu_ref_init(&css->refcnt, css_release); + if (err) + goto err_destroy; + + init_css(css, ss, cgrp); + } + /* creation succeeded, notify subsystems */ for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; + err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); + if (err) + goto err_destroy; + err = online_css(css); if (err) goto err_destroy; @@ -4205,30 +4219,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } - idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - - err = cgroup_addrm_files(cgrp, cgroup_base_files, true); - if (err) - goto err_destroy; - - err = cgroup_populate_dir(cgrp, root->subsys_mask); - if (err) - goto err_destroy; - mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; -err_free_all: - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - if (css) { - percpu_ref_cancel_init(&css->refcnt); - ss->css_free(css); - } - } +err_unlock: mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ deactivate_super(sb); -- cgit v0.10.2 From 9d403e99238ed6d7151a2c07db6cf8f6932ef3d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Dec 2013 15:11:56 -0500 Subject: cgroup: combine css handling loops in cgroup_create() Now that css operations in cgroup_create() are back-to-back, there isn't much point in allocating css's in one loop and onlining them in another. Merge the two loops so that a css is allocated and onlined on each iteration. css_ar[] is no longer necessary and replaced with a single pointer. This also simplifies the error handling path. 
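For illustration, the merged loop and its unwinding have roughly the following shape (a simplified sketch of the patch below, with declarations and surrounding locking elided, not the verbatim code):

	struct cgroup_subsys_state *css = NULL;

	for_each_root_subsys(root, ss) {
		css = ss->css_alloc(cgroup_css(parent, ss));
		if (IS_ERR(css)) {
			err = PTR_ERR(css);
			css = NULL;	/* nothing to unwind yet */
			goto err_destroy;
		}
		/* ... percpu_ref_init(), init_css(), populate dir, online_css() ... */
		css = NULL;	/* fully committed; error path must not free it */
	}
	...
err_destroy:
	if (css) {		/* at most one css can be half-constructed */
		percpu_ref_cancel_init(&css->refcnt);
		css->ss->css_free(css);
	}

Since allocation and onlining are now back-to-back, only the css of the current iteration can be in a partially constructed state, which is what lets css_ar[] collapse into a single pointer.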
Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 30a2670..39e2295 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4084,7 +4084,7 @@ static void offline_css(struct cgroup_subsys_state *css) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { - struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; + struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4173,26 +4173,20 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err) goto err_destroy; + /* let's create and online css's */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css; - css = ss->css_alloc(cgroup_css(parent, ss)); if (IS_ERR(css)) { err = PTR_ERR(css); + css = NULL; goto err_destroy; } - css_ar[ss->subsys_id] = css; err = percpu_ref_init(&css->refcnt, css_release); if (err) goto err_destroy; init_css(css, ss, cgrp); - } - - /* creation succeeded, notify subsystems */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); if (err) @@ -4202,12 +4196,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err) goto err_destroy; - /* each css holds a ref to the cgroup's dentry and parent css */ dget(dentry); css_get(css->parent); /* mark it consumed for error path */ - css_ar[ss->subsys_id] = NULL; + css = NULL; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { @@ -4237,13 +4230,9 @@ err_free_cgrp: return err; err_destroy: - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - if (css) { - percpu_ref_cancel_init(&css->refcnt); - ss->css_free(css); - } + if (css) { + percpu_ref_cancel_init(&css->refcnt); + css->ss->css_free(css); } cgroup_destroy_locked(cgrp); mutex_unlock(&cgroup_mutex); -- cgit v0.10.2 From c81c925ad9b0460a14ec35b52c61158da0733d51 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Dec 2013 15:11:56 -0500 Subject: cgroup: factor out cgroup_subsys_state creation into create_css() Now that all operations to create a css (cgroup_subsys_state) are collected into a single loop in cgroup_create(), it's easy to factor it out into its own function. Factor out css creation into create_css(). This makes the code easier to follow and will enable decoupling css creation from cgroup creation, which is necessary for the planned unified hierarchy. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 39e2295..d12c29f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4073,6 +4073,62 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); } +/** + * create_css - create a cgroup_subsys_state + * @cgrp: the cgroup new css will be associated with + * @ss: the subsys of new css + * + * Create a new css associated with @cgrp - @ss pair. On success, the new + * css is online and installed in @cgrp with all interface files created. + * Returns 0 on success, -errno on failure.
+ */ +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ + struct cgroup *parent = cgrp->parent; + struct cgroup_subsys_state *css; + int err; + + lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); + + css = ss->css_alloc(cgroup_css(parent, ss)); + if (IS_ERR(css)) + return PTR_ERR(css); + + err = percpu_ref_init(&css->refcnt, css_release); + if (err) + goto err_free; + + init_css(css, ss, cgrp); + + err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); + if (err) + goto err_free; + + err = online_css(css); + if (err) + goto err_free; + + dget(cgrp->dentry); + css_get(css->parent); + + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && + parent->parent) { + pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); + if (!strcmp(ss->name, "memory")) + pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + ss->warned_broken_hierarchy = true; + } + + return 0; + +err_free: + percpu_ref_cancel_init(&css->refcnt); + ss->css_free(css); + return err; +} + /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -4084,7 +4140,6 @@ static void offline_css(struct cgroup_subsys_state *css) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { - struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4175,41 +4230,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* let's create and online css's */ for_each_root_subsys(root, ss) { - css = ss->css_alloc(cgroup_css(parent, ss)); - if (IS_ERR(css)) { - err = PTR_ERR(css); - css = NULL; - goto err_destroy; - } - - err = percpu_ref_init(&css->refcnt, css_release); + err = create_css(cgrp, ss); if (err) goto err_destroy; - - init_css(css, ss, cgrp); - - err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); - if (err) - goto err_destroy; - - err = online_css(css); - if (err) - goto err_destroy; - - dget(dentry); - css_get(css->parent); - - /* mark it consumed for error path */ - css = NULL; - - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { - pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); - ss->warned_broken_hierarchy = true; - } } mutex_unlock(&cgroup_mutex); @@ -4230,10 +4253,6 @@ err_free_cgrp: return err; err_destroy: - if (css) { - percpu_ref_cancel_init(&css->refcnt); - css->ss->css_free(css); - } cgroup_destroy_locked(cgrp); mutex_unlock(&cgroup_mutex); mutex_unlock(&dentry->d_inode->i_mutex); -- cgit v0.10.2 From 1c6727af4b495a9ec74c46d1fc08e508e675899d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Dec 2013 15:11:56 -0500 Subject: cgroup: implement for_each_css() There are enough places where css's of a cgroup are iterated, which currently use for_each_root_subsys() + explicit cgroup_css(). This patch implements for_each_css() and replaces the above combination with it. This patch doesn't introduce any behavior changes.
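As a usage sketch (a hypothetical caller, not taken from the patch), walking every css attached to a cgroup now reads:

	struct cgroup_subsys_state *css;
	int ssid;

	/* should be called under cgroup_mutex; empty slots are skipped */
	for_each_css(css, ssid, cgrp)
		pr_info("cgroup has a css for %s\n", css->ss->name);

The cursor is the css itself, so callers no longer need the for_each_root_subsys() + cgroup_css() combination, and subsystems without a css on @cgrp are skipped automatically.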
v2: Updated to apply cleanly on top of v2 of "cgroup: fix css leaks on online_css() failure" Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d12c29f..329fde8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -243,6 +243,21 @@ static int notify_on_release(const struct cgroup *cgrp) } /** + * for_each_css - iterate all css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_mutex. + */ +#define for_each_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = rcu_dereference_check( \ + (cgrp)->subsys[(ssid)], \ + lockdep_is_held(&cgroup_mutex)))) { } \ + else + +/** * for_each_subsys - iterate all loaded cgroup subsystems * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end @@ -1942,8 +1957,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, bool threadgroup) { int retval, i, group_size; - struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroupfs_root *root = cgrp->root; + struct cgroup_subsys_state *css, *failed_css = NULL; /* threadgroup list cursor and array */ struct task_struct *leader = tsk; struct task_and_cgroup *tc; @@ -2016,13 +2031,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 1: check that we can legitimately attach to the cgroup. */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss->can_attach) { - retval = ss->can_attach(css, &tset); + for_each_css(css, i, cgrp) { + if (css->ss->can_attach) { + retval = css->ss->can_attach(css, &tset); if (retval) { - failed_ss = ss; + failed_css = css; goto out_cancel_attach; } } @@ -2058,12 +2071,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 4: do subsystem attach callbacks. */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss->attach) - ss->attach(css, &tset); - } + for_each_css(css, i, cgrp) + if (css->ss->attach) + css->ss->attach(css, &tset); /* * step 5: success! and cleanup @@ -2080,13 +2090,11 @@ out_put_css_set_refs: } out_cancel_attach: if (retval) { - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss == failed_ss) + for_each_css(css, i, cgrp) { + if (css == failed_css) break; - if (ss->cancel_attach) - ss->cancel_attach(css, &tset); + if (css->ss->cancel_attach) + css->ss->cancel_attach(css, &tset); } } out_free_group_list: @@ -4375,9 +4383,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; - struct cgroup_subsys *ss; + struct cgroup_subsys_state *css; struct cgroup *child; bool empty; + int ssid; lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4413,12 +4422,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * will be invoked to perform the rest of destruction once the * percpu refs of all css's are confirmed to be killed. */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (css) - kill_css(css); - } + for_each_css(css, ssid, cgrp) + kill_css(css); /* * Mark @cgrp dead. 
This prevents further task migration and child -- cgit v0.10.2 From b85d20404cef6493f9d2edbafe83f9c72aece9a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Dec 2013 15:11:57 -0500 Subject: cgroup: remove for_each_root_subsys() After the previous patch which introduced for_each_css(), for_each_root_subsys() only has two users left. This patch replaces it with for_each_subsys() + explicit subsys_mask testing and removes for_each_root_subsys() along with cgroupfs_root->subsys_list handling. This patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8b9a594..cfaf416 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -319,9 +319,6 @@ struct cgroupfs_root { /* Unique id for this hierarchy. */ int hierarchy_id; - /* A list running through the attached subsystems */ - struct list_head subsys_list; - /* The root cgroup for this hierarchy */ struct cgroup top_cgroup; @@ -617,12 +614,8 @@ struct cgroup_subsys { #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; - /* - * Link to parent, and list entry in parent's children. - * Protected by cgroup_lock() - */ + /* link to parent, protected by cgroup_lock() */ struct cgroupfs_root *root; - struct list_head sibling; /* list of cftype_sets */ struct list_head cftsets; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 329fde8..fb1193b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -283,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp) for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[i]) || true); (i)++) -/* iterate each subsystem attached to a hierarchy */ -#define for_each_root_subsys(root, ss) \ - list_for_each_entry((ss), &(root)->subsys_list, sibling) - /* iterate across the active hierarchies */ #define for_each_active_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) @@ -1033,7 +1029,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, cgroup_css(cgroup_dummy_top, ss)); cgroup_css(cgrp, ss)->cgroup = cgrp; - list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(cgroup_css(cgrp, ss)); @@ -1052,7 +1047,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; - list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); /* subsystem is now free - drop reference on module */ module_put(ss->module); @@ -1079,10 +1073,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) { struct cgroupfs_root *root = dentry->d_sb->s_fs_info; struct cgroup_subsys *ss; + int ssid; mutex_lock(&cgroup_root_mutex); - for_each_root_subsys(root, ss) - seq_printf(seq, ",%s", ss->name); + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); if (root->flags & CGRP_ROOT_NOPREFIX) @@ -1352,7 +1348,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; - INIT_LIST_HEAD(&root->subsys_list); INIT_LIST_HEAD(&root->root_list); root->number_of_cgroups = 1; cgrp->root = root; @@ -4151,7 +4146,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; - int err = 0; + int ssid, err = 0; struct cgroup_subsys *ss; struct super_block *sb = root->sb; @@ -4237,10
+4232,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_destroy; /* let's create and online css's */ - for_each_root_subsys(root, ss) { - err = create_css(cgrp, ss); - if (err) - goto err_destroy; + for_each_subsys(ss, ssid) { + if (root->subsys_mask & (1 << ssid)) { + err = create_css(cgrp, ss); + if (err) + goto err_destroy; + } } mutex_unlock(&cgroup_mutex); @@ -4536,7 +4533,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) cgroup_init_cftsets(ss); /* Create the top cgroup state for this subsystem */ - list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); /* We don't handle early failures gracefully */ @@ -4626,7 +4622,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) return PTR_ERR(css); } - list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; /* our new subsystem will be attached to the dummy hierarchy. */ @@ -4702,9 +4697,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) /* deassign the subsys_id */ cgroup_subsys[ss->subsys_id] = NULL; - /* remove subsystem from the dummy root's list of subsystems */ - list_del_init(&ss->sibling); - /* * disentangle the css from all css_sets attached to the dummy * top. as in loading, we need to pay our respects to the hashtable @@ -4901,11 +4893,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; - int count = 0; + int ssid, count = 0; seq_printf(m, "%d:", root->hierarchy_id); - for_each_root_subsys(root, ss) - seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); -- cgit v0.10.2 From 0be8669dd530f60cf3f59f084518570c1dfb47ee Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 9 Dec 2013 20:38:29 +0800 Subject: cgroup: fix missing unlock on error in cgroup_load_subsys() Add the missing unlock before return from function cgroup_load_subsys() in the error handling case. Signed-off-by: Wei Yongjun Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fb1193b..402f7aa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4618,6 +4618,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return PTR_ERR(css); } -- cgit v0.10.2 From 10bf2f7e7db993273419ca9f51f5934e4cf71768 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Dec 2013 23:17:08 +0400 Subject: cgroup: fix fail path in cgroup_load_subsys() Calling cgroup_unload_subsys() from cgroup_load_subsys() after online_css() failure will result in a NULL ptr dereference on attempt to offline_css(), because online_css() only assigns css to cgroup on success. Let's fix that by skipping calls to offline_css() and css_free() in cgroup_unload_subsys() if there is no css, and freeing css in cgroup_load_subsys() on online_css() failure. 
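To illustrate the invariant the fix relies on, the two halves line up roughly as follows (a sketch of the patch below, with surrounding code elided):

	/* cgroup_load_subsys(): online_css() installs css in the cgroup only on success */
	ret = online_css(css);
	if (ret) {
		ss->css_free(css);	/* still owned here, so free it ourselves */
		goto err_unload;	/* the unload path will now see a NULL css */
	}

	/* cgroup_unload_subsys(): tolerate a css that never came online */
	css = cgroup_css(cgroup_dummy_top, ss);
	if (css)
		offline_css(css);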
Signed-off-by: Vladimir Davydov Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 402f7aa..7030f04 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4652,8 +4652,10 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); ret = online_css(css); - if (ret) + if (ret) { + ss->css_free(css); goto err_unload; + } /* success! */ mutex_unlock(&cgroup_root_mutex); @@ -4680,6 +4682,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); void cgroup_unload_subsys(struct cgroup_subsys *ss) { struct cgrp_cset_link *link; + struct cgroup_subsys_state *css; BUG_ON(ss->module == NULL); @@ -4693,7 +4696,9 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); - offline_css(cgroup_css(cgroup_dummy_top, ss)); + css = cgroup_css(cgroup_dummy_top, ss); + if (css) + offline_css(css); /* deassign the subsys_id */ cgroup_subsys[ss->subsys_id] = NULL; @@ -4720,7 +4725,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * need to free before marking as null because ss->css_free needs * the cgrp->subsys pointer to find their state. */ - ss->css_free(cgroup_css(cgroup_dummy_top, ss)); + if (css) + ss->css_free(css); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); mutex_unlock(&cgroup_root_mutex); -- cgit v0.10.2 From 8173d5a495b5f859d56d8581d0e2ce5c1fab9c33 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 22 Dec 2013 00:57:33 +0900 Subject: doc: cgroups: Fix typo in doc/cgroups Correct spelling typo in memory.txt and resource_counter.txt Signed-off-by: Masanari Iida Signed-off-by: Tejun Heo diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index e2bc132..2622115 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -577,7 +577,7 @@ Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" per-node page counts including "hierarchical_" which sums up all hierarchical children's values in addition to the memcg's own value. -The ouput format of memory.numa_stat is: +The output format of memory.numa_stat is: total= N0= N1= ... file= N0= N1= ... @@ -670,7 +670,7 @@ page tables. 8.1 Interface -This feature is disabled by default. It can be enabledi (and disabled again) by +This feature is disabled by default. It can be enabled (and disabled again) by writing to memory.move_charge_at_immigrate of the destination cgroup. If you want to enable it: diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt index c4d99ed..52e1da1 100644 --- a/Documentation/cgroups/resource_counter.txt +++ b/Documentation/cgroups/resource_counter.txt @@ -97,8 +97,8 @@ to work with it. (struct res_counter *rc, struct res_counter *top, unsinged long val) - Almost same as res_cunter_uncharge() but propagation of uncharge - stops when rc == top. This is useful when kill a res_coutner in + Almost same as res_counter_uncharge() but propagation of uncharge + stops when rc == top. This is useful when kill a res_counter in child cgroup. 2.1 Other accounting routines -- cgit v0.10.2 From b3ff8a2f9569fb41b9cf8902897d787a33bac84f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 12 Jan 2014 20:23:27 -0800 Subject: cgroup: remove stray references to css_id Trivial: remove the few stray references to css_id, which itself was removed in v3.13's 2ff2a7d03bbe "cgroup: kill css_id". 
Signed-off-by: Hugh Dickins Signed-off-by: Tejun Heo diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index fbcc851..61bcfc2 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -163,7 +163,6 @@ static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) static void bcachecg_destroy(struct cgroup *cgroup) { struct bch_cgroup *cg = cgroup_to_bcache(cgroup); - free_css_id(&bcache_subsys, &cg->css); kfree(cg); } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cfaf416..5c09759 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -29,7 +29,6 @@ struct cgroupfs_root; struct cgroup_subsys; struct inode; struct cgroup; -struct css_id; extern int cgroup_init_early(void); extern int cgroup_init(void); @@ -79,8 +78,6 @@ struct cgroup_subsys_state { struct cgroup_subsys_state *parent; unsigned long flags; - /* ID for this css, if possible */ - struct css_id __rcu *id; /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 6d757e3a..3bd0b8e 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -451,7 +451,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry * @ent: swap entry to be looked up. * - * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) + * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { -- cgit v0.10.2 From dd4b0a4676907481256d16d5de0851b315a6f22c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 18 Jan 2014 16:56:47 +0900 Subject: cgroup: trivial style updates * Place newline before function opening brace in cgroup_kill_sb(). * Insert space before assignment in attach_task_by_pid() tj: merged two patches into one. Signed-off-by: SeongJae Park Signed-off-by: Tejun Heo diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7030f04..680e2be 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1669,7 +1669,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, return ERR_PTR(ret); } -static void cgroup_kill_sb(struct super_block *sb) { +static void cgroup_kill_sb(struct super_block *sb) +{ struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; @@ -2117,7 +2118,7 @@ retry_find_task: tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); - ret= -ESRCH; + ret = -ESRCH; goto out_unlock_cgroup; } /* -- cgit v0.10.2