From f9f0a7d0dcbd19e9705e8b96a4b408f035e25c93 Mon Sep 17 00:00:00 2001
From: Olof Johansson <olof@lixom.net>
Date: Mon, 8 Jul 2013 15:59:35 -0700
Subject: drivers/dma/iop-adma.c: fix new warnings

The recent "drivers/dma: remove unused support for MEMSET operations"
change has fallout from lack of build testing by the author.  This
fixes:

  drivers/dma/iop-adma.c:1020:13: warning: unused variable 'dma_addr' [-Wunused-variable]
  drivers/dma/iop-adma.c:1519:2: warning: format '%s' expects a matching 'char *' argument [-Wformat=]

Signed-off-by: Olof Johansson <olof@lixom.net>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index c9cc08c..cc727ec 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -1017,7 +1017,7 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device)
 	struct page *xor_srcs[IOP_ADMA_NUM_SRC_TEST];
 	struct page *zero_sum_srcs[IOP_ADMA_NUM_SRC_TEST + 1];
 	dma_addr_t dma_srcs[IOP_ADMA_NUM_SRC_TEST + 1];
-	dma_addr_t dma_addr, dest_dma;
+	dma_addr_t dest_dma;
 	struct dma_async_tx_descriptor *tx;
 	struct dma_chan *dma_chan;
 	dma_cookie_t cookie;
@@ -1516,7 +1516,7 @@ static int iop_adma_probe(struct platform_device *pdev)
 			goto err_free_iop_chan;
 	}
 
-	dev_info(&pdev->dev, "Intel(R) IOP: ( %s%s%s%s%s%s%s)\n",
+	dev_info(&pdev->dev, "Intel(R) IOP: ( %s%s%s%s%s%s)\n",
 		 dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "",
 		 dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "",
 		 dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
-- 
cgit v0.10.2


From 79f6530cb59e2a0af6953742a33cc29e98ca631c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 8 Jul 2013 15:59:36 -0700
Subject: audit: fix mq_open and mq_unlink to add the MQ root as a hidden
 parent audit_names record

The old audit PATH records for mq_open looked like this:

  type=PATH msg=audit(1366282323.982:869): item=1 name=(null) inode=6777
  dev=00:0c mode=041777 ouid=0 ogid=0 rdev=00:00
  obj=system_u:object_r:tmpfs_t:s15:c0.c1023
  type=PATH msg=audit(1366282323.982:869): item=0 name="test_mq" inode=26732
  dev=00:0c mode=0100700 ouid=0 ogid=0 rdev=00:00
  obj=staff_u:object_r:user_tmpfs_t:s15:c0.c1023

...with the audit related changes that went into 3.7, they now look like this:

  type=PATH msg=audit(1366282236.776:3606): item=2 name=(null) inode=66655
  dev=00:0c mode=0100700 ouid=0 ogid=0 rdev=00:00
  obj=staff_u:object_r:user_tmpfs_t:s15:c0.c1023
  type=PATH msg=audit(1366282236.776:3606): item=1 name=(null) inode=6926
  dev=00:0c mode=041777 ouid=0 ogid=0 rdev=00:00
  obj=system_u:object_r:tmpfs_t:s15:c0.c1023
  type=PATH msg=audit(1366282236.776:3606): item=0 name="test_mq"

Both of these look wrong to me.  As Steve Grubb pointed out:

 "What we need is 1 PATH record that identifies the MQ.  The other PATH
  records probably should not be there."

Fix it to record the mq root as a parent, and flag it such that it
should be hidden from view when the names are logged, since the root of
the mq filesystem isn't terribly interesting.  With this change, we get
a single PATH record that looks more like this:

  type=PATH msg=audit(1368021604.836:484): item=0 name="test_mq" inode=16914
  dev=00:0c mode=0100644 ouid=0 ogid=0 rdev=00:00
  obj=unconfined_u:object_r:user_tmpfs_t:s0

In order to do this, a new audit_inode_parent_hidden() function is
added.  If we do it this way, then we avoid having the existing callers
of audit_inode needing to do any sort of flag conversion if auditing is
inactive.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reported-by: Jiri Jaburek <jjaburek@redhat.com>
Cc: Steve Grubb <sgrubb@redhat.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/audit.h b/include/linux/audit.h
index b20b038..729a4d1 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -103,8 +103,11 @@ extern void __audit_syscall_exit(int ret_success, long ret_value);
 extern struct filename *__audit_reusename(const __user char *uptr);
 extern void __audit_getname(struct filename *name);
 extern void audit_putname(struct filename *name);
+
+#define AUDIT_INODE_PARENT	1	/* dentry represents the parent */
+#define AUDIT_INODE_HIDDEN	2	/* audit record should be hidden */
 extern void __audit_inode(struct filename *name, const struct dentry *dentry,
-				unsigned int parent);
+				unsigned int flags);
 extern void __audit_inode_child(const struct inode *parent,
 				const struct dentry *dentry,
 				const unsigned char type);
@@ -148,10 +151,22 @@ static inline void audit_getname(struct filename *name)
 	if (unlikely(!audit_dummy_context()))
 		__audit_getname(name);
 }
-static inline void audit_inode(struct filename *name, const struct dentry *dentry,
+static inline void audit_inode(struct filename *name,
+				const struct dentry *dentry,
 				unsigned int parent) {
+	if (unlikely(!audit_dummy_context())) {
+		unsigned int flags = 0;
+		if (parent)
+			flags |= AUDIT_INODE_PARENT;
+		__audit_inode(name, dentry, flags);
+	}
+}
+static inline void audit_inode_parent_hidden(struct filename *name,
+						const struct dentry *dentry)
+{
 	if (unlikely(!audit_dummy_context()))
-		__audit_inode(name, dentry, parent);
+		__audit_inode(name, dentry,
+				AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN);
 }
 static inline void audit_inode_child(const struct inode *parent,
 				     const struct dentry *dentry,
@@ -311,7 +326,7 @@ static inline void audit_putname(struct filename *name)
 { }
 static inline void __audit_inode(struct filename *name,
 					const struct dentry *dentry,
-					unsigned int parent)
+					unsigned int flags)
 { }
 static inline void __audit_inode_child(const struct inode *parent,
 					const struct dentry *dentry,
@@ -321,6 +336,9 @@ static inline void audit_inode(struct filename *name,
 				const struct dentry *dentry,
 				unsigned int parent)
 { }
+static inline void audit_inode_parent_hidden(struct filename *name,
+				const struct dentry *dentry)
+{ }
 static inline void audit_inode_child(const struct inode *parent,
 				     const struct dentry *dentry,
 				     const unsigned char type)
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index e4e47f6..ae1996d 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -823,6 +823,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
 				error = ro;
 				goto out;
 			}
+			audit_inode_parent_hidden(name, root);
 			filp = do_create(ipc_ns, root->d_inode,
 						&path, oflag, mode,
 						u_attr ? &attr : NULL);
@@ -868,6 +869,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
 	if (IS_ERR(name))
 		return PTR_ERR(name);
 
+	audit_inode_parent_hidden(name, mnt->mnt_root);
 	err = mnt_want_write(mnt);
 	if (err)
 		goto out_name;
diff --git a/kernel/audit.h b/kernel/audit.h
index 1c95131..123c9b7 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -85,6 +85,7 @@ struct audit_names {
 
 	struct filename		*name;
 	int			name_len;	/* number of chars to log */
+	bool			hidden;		/* don't log this record */
 	bool			name_put;	/* call __putname()? */
 
 	unsigned long		ino;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3c8a601..9845cb3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	}
 
 	i = 0;
-	list_for_each_entry(n, &context->names_list, list)
+	list_for_each_entry(n, &context->names_list, list) {
+		if (n->hidden)
+			continue;
 		audit_log_name(context, n, NULL, i++, &call_panic);
+	}
 
 	/* Send end of event record to help user space know we are finished */
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name)
  * __audit_inode - store the inode and device from a lookup
  * @name: name being audited
  * @dentry: dentry being audited
- * @parent: does this dentry represent the parent?
+ * @flags: attributes for this particular entry
  */
 void __audit_inode(struct filename *name, const struct dentry *dentry,
-		   unsigned int parent)
+		   unsigned int flags)
 {
 	struct audit_context *context = current->audit_context;
 	const struct inode *inode = dentry->d_inode;
 	struct audit_names *n;
+	bool parent = flags & AUDIT_INODE_PARENT;
 
 	if (!context->in_syscall)
 		return;
@@ -1831,6 +1835,8 @@ out:
 	if (parent) {
 		n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
 		n->type = AUDIT_TYPE_PARENT;
+		if (flags & AUDIT_INODE_HIDDEN)
+			n->hidden = true;
 	} else {
 		n->name_len = AUDIT_NAME_FULL;
 		n->type = AUDIT_TYPE_NORMAL;
-- 
cgit v0.10.2


From 6beb8a23b50d38a003e80c5f16b50c56e8ae3387 Mon Sep 17 00:00:00 2001
From: "Raphael S. Carvalho" <raphael.scarv@gmail.com>
Date: Mon, 8 Jul 2013 15:59:37 -0700
Subject: kernel/auditfilter.c: fixing build warning

  kernel/auditfilter.c:426: warning: this decimal constant is unsigned only in ISO C90

Signed-off-by: Raphael S. Carvalho <raphael.scarv@gmail.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6bd4a90..0ee9eff 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		f->lsm_rule = NULL;
 
 		/* Support legacy tests for a valid loginuid */
-		if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) {
+		if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295U)) {
 			f->type = AUDIT_LOGINUID_SET;
 			f->val = 0;
 		}
-- 
cgit v0.10.2


From 2f992ee85aaa7dfd2bda43efe4493af1e108d054 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Mon, 8 Jul 2013 15:59:38 -0700
Subject: kernel/auditfilter.c: fix leak in audit_add_rule() error path

If both 'tree' and 'watch' are valid we must call audit_put_tree(), just
like the preceding code within audit_add_rule().

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric Paris <eparis@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0ee9eff..3d15c66 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry)
 		err = audit_add_watch(&entry->rule, &list);
 		if (err) {
 			mutex_unlock(&audit_filter_mutex);
+			/*
+			 * normally audit_add_tree_rule() will free it
+			 * on failure
+			 */
+			if (tree)
+				audit_put_tree(tree);
 			goto error;
 		}
 	}
-- 
cgit v0.10.2


From b9ce54c9f59894e787e3067d2f758c297fcd6fd0 Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@xilinx.com>
Date: Mon, 8 Jul 2013 15:59:39 -0700
Subject: audit: Fix decimal constant description

Use proper decimal type for comparison with u32.

Compilation warning was introduced by 780a7654 ("audit: Make testing for
a valid loginuid explicit.")

  kernel/auditfilter.c: In function 'audit_data_to_entry':
  kernel/auditfilter.c:426:3: warning: this decimal constant is unsigned only in ISO C90 [enabled by default]
     if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) {

Signed-off-by: Michal Simek <michal.simek@xilinx.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric Paris <eparis@redhat.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 3d15c66..f7aee8b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		f->lsm_rule = NULL;
 
 		/* Support legacy tests for a valid loginuid */
-		if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295U)) {
+		if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) {
 			f->type = AUDIT_LOGINUID_SET;
 			f->val = 0;
 		}
-- 
cgit v0.10.2


From de1e0c40aceb9d5bff09c3a3b97b2f1b178af53f Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 8 Jul 2013 15:59:40 -0700
Subject: fanotify: info leak in copy_event_to_user()

The ->reserved field isn't cleared so we leak one byte of stack
information to userspace.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 1ea52f7..e16076d 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -122,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group,
 	metadata->event_len = FAN_EVENT_METADATA_LEN;
 	metadata->metadata_len = FAN_EVENT_METADATA_LEN;
 	metadata->vers = FANOTIFY_METADATA_VERSION;
+	metadata->reserved = 0;
 	metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
 	metadata->pid = pid_vnr(event->tgid);
 	if (unlikely(event->mask & FAN_Q_OVERFLOW))
-- 
cgit v0.10.2


From 7b18527c4a95397b443c8c22f75634d5d11c9d47 Mon Sep 17 00:00:00 2001
From: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Date: Mon, 8 Jul 2013 15:59:42 -0700
Subject: fanotify: fix races when adding/removing marks

For both adding an event to an existing mark and destroying a mark we
first have to find it via fsnotify_find_[inode|vfsmount]_mark().  But
getting the mark and adding an event (or destroying it) is not done
atomically.  This opens a race where a thread is about to destroy a mark
while another thread still finds the same mark and adds an event to its
mask although it will be destroyed.

Another race exists concerning the excess of a groups number of marks
limit: When a mark is added the number of group marks is checked against
the max number of marks per group and increased afterwards.  Since check
and increment is also not done atomically, this may result in 2 or more
processes passing the check at the same time and increasing the number
of group marks above the allowed limit.

With this patch both races are avoided by doing the concerning
operations with the groups mark mutex locked.

Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index e16076d..4e1d8ec 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -524,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 	__u32 removed;
 	int destroy_mark;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
-	if (!fsn_mark)
+	if (!fsn_mark) {
+		mutex_unlock(&group->mark_mutex);
 		return -ENOENT;
+	}
 
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
 	if (destroy_mark)
-		fsnotify_destroy_mark(fsn_mark, group);
+		fsnotify_destroy_mark_locked(fsn_mark, group);
+	mutex_unlock(&group->mark_mutex);
 
 	fsnotify_put_mark(fsn_mark);
 	if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -548,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 	__u32 removed;
 	int destroy_mark;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
-	if (!fsn_mark)
+	if (!fsn_mark) {
+		mutex_unlock(&group->mark_mutex);
 		return -ENOENT;
+	}
 
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
 	if (destroy_mark)
-		fsnotify_destroy_mark(fsn_mark, group);
+		fsnotify_destroy_mark_locked(fsn_mark, group);
+	mutex_unlock(&group->mark_mutex);
+
 	/* matches the fsnotify_find_inode_mark() */
 	fsnotify_put_mark(fsn_mark);
 	if (removed & inode->i_fsnotify_mask)
@@ -599,21 +608,29 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 	__u32 added;
 	int ret = 0;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
 	if (!fsn_mark) {
-		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) {
+			mutex_unlock(&group->mark_mutex);
 			return -ENOSPC;
+		}
 
 		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-		if (!fsn_mark)
+		if (!fsn_mark) {
+			mutex_unlock(&group->mark_mutex);
 			return -ENOMEM;
+		}
 
 		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
-		ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
-		if (ret)
+		ret = fsnotify_add_mark_locked(fsn_mark, group, NULL, mnt, 0);
+		if (ret) {
+			mutex_unlock(&group->mark_mutex);
 			goto err;
+		}
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	mutex_unlock(&group->mark_mutex);
 
 	if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
 		fsnotify_recalc_vfsmount_mask(mnt);
@@ -642,21 +659,29 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 	    (atomic_read(&inode->i_writecount) > 0))
 		return 0;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark) {
-		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) {
+			mutex_unlock(&group->mark_mutex);
 			return -ENOSPC;
+		}
 
 		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-		if (!fsn_mark)
+		if (!fsn_mark) {
+			mutex_unlock(&group->mark_mutex);
 			return -ENOMEM;
+		}
 
 		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
-		ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
-		if (ret)
+		ret = fsnotify_add_mark_locked(fsn_mark, group, inode, NULL, 0);
+		if (ret) {
+			mutex_unlock(&group->mark_mutex);
 			goto err;
+		}
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	mutex_unlock(&group->mark_mutex);
 
 	if (added & ~inode->i_fsnotify_mask)
 		fsnotify_recalc_inode_mask(inode);
-- 
cgit v0.10.2


From 5e9c070ca085439fbec9e9629dd6171ae325d4d8 Mon Sep 17 00:00:00 2001
From: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Date: Mon, 8 Jul 2013 15:59:43 -0700
Subject: fanotify: put duplicate code for adding vfsmount/inode marks into an
 own function

The code under the groups mark_mutex in fanotify_add_inode_mark() and
fanotify_add_vfsmount_mark() is almost identical.  So put it into a
seperate function.

Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 4e1d8ec..e44cb64 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -600,33 +600,45 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 	return mask & ~oldmask;
 }
 
+static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
+						   struct inode *inode,
+						   struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark;
+	int ret;
+
+	if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+		return ERR_PTR(-ENOSPC);
+
+	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+	if (!mark)
+		return ERR_PTR(-ENOMEM);
+
+	fsnotify_init_mark(mark, fanotify_free_mark);
+	ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
+	if (ret) {
+		fsnotify_put_mark(mark);
+		return ERR_PTR(ret);
+	}
+
+	return mark;
+}
+
+
 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 				      struct vfsmount *mnt, __u32 mask,
 				      unsigned int flags)
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
-	int ret = 0;
 
 	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
 	if (!fsn_mark) {
-		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) {
-			mutex_unlock(&group->mark_mutex);
-			return -ENOSPC;
-		}
-
-		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-		if (!fsn_mark) {
-			mutex_unlock(&group->mark_mutex);
-			return -ENOMEM;
-		}
-
-		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
-		ret = fsnotify_add_mark_locked(fsn_mark, group, NULL, mnt, 0);
-		if (ret) {
+		fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
+		if (IS_ERR(fsn_mark)) {
 			mutex_unlock(&group->mark_mutex);
-			goto err;
+			return PTR_ERR(fsn_mark);
 		}
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
@@ -634,9 +646,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 
 	if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
 		fsnotify_recalc_vfsmount_mask(mnt);
-err:
+
 	fsnotify_put_mark(fsn_mark);
-	return ret;
+	return 0;
 }
 
 static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -645,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
-	int ret = 0;
 
 	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
 
@@ -662,22 +673,10 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark) {
-		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) {
+		fsn_mark = fanotify_add_new_mark(group, inode, NULL);
+		if (IS_ERR(fsn_mark)) {
 			mutex_unlock(&group->mark_mutex);
-			return -ENOSPC;
-		}
-
-		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-		if (!fsn_mark) {
-			mutex_unlock(&group->mark_mutex);
-			return -ENOMEM;
-		}
-
-		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
-		ret = fsnotify_add_mark_locked(fsn_mark, group, inode, NULL, 0);
-		if (ret) {
-			mutex_unlock(&group->mark_mutex);
-			goto err;
+			return PTR_ERR(fsn_mark);
 		}
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
@@ -685,9 +684,9 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 
 	if (added & ~inode->i_fsnotify_mask)
 		fsnotify_recalc_inode_mask(inode);
-err:
+
 	fsnotify_put_mark(fsn_mark);
-	return ret;
+	return 0;
 }
 
 /* fanotify syscalls */
-- 
cgit v0.10.2


From 52f85729805b7a0ec5a7a70e2c814193929de2f0 Mon Sep 17 00:00:00 2001
From: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Date: Mon, 8 Jul 2013 15:59:44 -0700
Subject: dnotify: replace dnotify_mark_mutex with mark mutex of dnotify_group

There is no need to use a special mutex to protect against the
fcntl/close race (see dnotify.c for a description of this race).
Instead the dnotify_groups mark mutex can be used.

Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 2bfe6dc..1fedd5f 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1;
 static struct kmem_cache *dnotify_struct_cache __read_mostly;
 static struct kmem_cache *dnotify_mark_cache __read_mostly;
 static struct fsnotify_group *dnotify_group __read_mostly;
-static DEFINE_MUTEX(dnotify_mark_mutex);
 
 /*
  * dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 		return;
 	dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 
-	mutex_lock(&dnotify_mark_mutex);
+	mutex_lock(&dnotify_group->mark_mutex);
 
 	spin_lock(&fsn_mark->lock);
 	prev = &dn_mark->dn;
@@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 
 	spin_unlock(&fsn_mark->lock);
 
-	/* nothing else could have found us thanks to the dnotify_mark_mutex */
+	/* nothing else could have found us thanks to the dnotify_groups
+	   mark_mutex */
 	if (dn_mark->dn == NULL)
-		fsnotify_destroy_mark(fsn_mark, dnotify_group);
+		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
 
-	mutex_unlock(&dnotify_mark_mutex);
+	mutex_unlock(&dnotify_group->mark_mutex);
 
 	fsnotify_put_mark(fsn_mark);
 }
@@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	new_dn_mark->dn = NULL;
 
 	/* this is needed to prevent the fcntl/close race described below */
-	mutex_lock(&dnotify_mark_mutex);
+	mutex_lock(&dnotify_group->mark_mutex);
 
 	/* add the new_fsn_mark or find an old one. */
 	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
@@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 		dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 		spin_lock(&fsn_mark->lock);
 	} else {
-		fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
+		fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
+					 NULL, 0);
 		spin_lock(&new_fsn_mark->lock);
 		fsn_mark = new_fsn_mark;
 		dn_mark = new_dn_mark;
@@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 
 	/* if (f != filp) means that we lost a race and another task/thread
 	 * actually closed the fd we are still playing with before we grabbed
-	 * the dnotify_mark_mutex and fsn_mark->lock.  Since closing the fd is the
-	 * only time we clean up the marks we need to get our mark off
-	 * the list. */
+	 * the dnotify_groups mark_mutex and fsn_mark->lock.  Since closing the
+	 * fd is the only time we clean up the marks we need to get our mark
+	 * off the list. */
 	if (f != filp) {
 		/* if we added ourselves, shoot ourselves, it's possible that
 		 * the flush actually did shoot this fsn_mark.  That's fine too
@@ -385,9 +386,9 @@ out:
 	spin_unlock(&fsn_mark->lock);
 
 	if (destroy)
-		fsnotify_destroy_mark(fsn_mark, dnotify_group);
+		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
 
-	mutex_unlock(&dnotify_mark_mutex);
+	mutex_unlock(&dnotify_group->mark_mutex);
 	fsnotify_put_mark(fsn_mark);
 out_err:
 	if (new_fsn_mark)
-- 
cgit v0.10.2


From e1e5a9f84e4dbd3567bb8b0d5e79db6e1e5ebc35 Mon Sep 17 00:00:00 2001
From: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Date: Mon, 8 Jul 2013 15:59:45 -0700
Subject: inotify: fix race when adding a new watch

In inotify_new_watch() the number of watches for a group is compared
against the max number of allowed watches and increased afterwards.  The
check and incrementation is not done atomically, so it is possible for
multiple concurrent threads to pass the check and increment the number
of marks above the allowed max.

This patch uses an inotify groups mark_lock to ensure that both check
and incrementation are done atomic.  Furthermore we dont have to worry
about the race that allows a concurrent thread to add a watch just after
inotify_update_existing_watch() returned with -ENOENT anymore, since
this is also synchronized by the groups mark mutex now.

Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 959815c..60f954a 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -636,7 +636,8 @@ static int inotify_new_watch(struct fsnotify_group *group,
 		goto out_err;
 
 	/* we are on the idr, now get on the inode */
-	ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
+	ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
+				       NULL, 0);
 	if (ret) {
 		/* we failed to get on the inode, get off the idr */
 		inotify_remove_from_idr(group, tmp_i_mark);
@@ -660,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
 {
 	int ret = 0;
 
-retry:
+	mutex_lock(&group->mark_mutex);
 	/* try to update and existing watch with the new arg */
 	ret = inotify_update_existing_watch(group, inode, arg);
 	/* no mark present, try to add a new one */
 	if (ret == -ENOENT)
 		ret = inotify_new_watch(group, inode, arg);
-	/*
-	 * inotify_new_watch could race with another thread which did an
-	 * inotify_new_watch between the update_existing and the add watch
-	 * here, go back and try to update an existing mark again.
-	 */
-	if (ret == -EEXIST)
-		goto retry;
+	mutex_unlock(&group->mark_mutex);
 
 	return ret;
 }
-- 
cgit v0.10.2


From 9756b9187eebb093b9f6a154ecceb67648e53391 Mon Sep 17 00:00:00 2001
From: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Date: Mon, 8 Jul 2013 15:59:46 -0700
Subject: fsnotify: update comments concerning locking scheme

There have been changes in the locking scheme of fsnotify but the
comments in the source code have not been updated yet.  This patch
corrects this.

Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc6b49b..923fe4a 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -20,28 +20,29 @@
  * fsnotify inode mark locking/lifetime/and refcnting
  *
  * REFCNT:
- * The mark->refcnt tells how many "things" in the kernel currently are
- * referencing this object.  The object typically will live inside the kernel
- * with a refcnt of 2, one for each list it is on (i_list, g_list).  Any task
- * which can find this object holding the appropriete locks, can take a reference
- * and the object itself is guaranteed to survive until the reference is dropped.
+ * The group->recnt and mark->refcnt tell how many "things" in the kernel
+ * currently are referencing the objects. Both kind of objects typically will
+ * live inside the kernel with a refcnt of 2, one for its creation and one for
+ * the reference a group and a mark hold to each other.
+ * If you are holding the appropriate locks, you can take a reference and the
+ * object itself is guaranteed to survive until the reference is dropped.
  *
  * LOCKING:
- * There are 3 spinlocks involved with fsnotify inode marks and they MUST
- * be taken in order as follows:
+ * There are 3 locks involved with fsnotify inode marks and they MUST be taken
+ * in order as follows:
  *
+ * group->mark_mutex
  * mark->lock
- * group->mark_lock
  * inode->i_lock
  *
- * mark->lock protects 2 things, mark->group and mark->inode.  You must hold
- * that lock to dereference either of these things (they could be NULL even with
- * the lock)
- *
- * group->mark_lock protects the marks_list anchored inside a given group
- * and each mark is hooked via the g_list.  It also sorta protects the
- * free_g_list, which when used is anchored by a private list on the stack of the
- * task which held the group->mark_lock.
+ * group->mark_mutex protects the marks_list anchored inside a given group and
+ * each mark is hooked via the g_list.  It also protects the groups private
+ * data (i.e group limits).
+
+ * mark->lock protects the marks attributes like its masks and flags.
+ * Furthermore it protects the access to a reference of the group that the mark
+ * is assigned to as well as the access to a reference of the inode/vfsmount
+ * that is being watched by the mark.
  *
  * inode->i_lock protects the i_fsnotify_marks list anchored inside a
  * given inode and each mark is hooked via the i_list. (and sorta the
@@ -64,18 +65,11 @@
  * inode.  We take i_lock and walk the i_fsnotify_marks safely.  For each
  * mark on the list we take a reference (so the mark can't disappear under us).
  * We remove that mark form the inode's list of marks and we add this mark to a
- * private list anchored on the stack using i_free_list;  At this point we no
- * longer fear anything finding the mark using the inode's list of marks.
- *
- * We can safely and locklessly run the private list on the stack of everything
- * we just unattached from the original inode.  For each mark on the private list
- * we grab the mark-> and can thus dereference mark->group and mark->inode.  If
- * we see the group and inode are not NULL we take those locks.  Now holding all
- * 3 locks we can completely remove the mark from other tasks finding it in the
- * future.  Remember, 10 things might already be referencing this mark, but they
- * better be holding a ref.  We drop our reference we took before we unhooked it
- * from the inode.  When the ref hits 0 we can free the mark.
- *
+ * private list anchored on the stack using i_free_list; we walk i_free_list
+ * and before we destroy the mark we make sure that we dont race with a
+ * concurrent destroy_group by getting a ref to the marks group and taking the
+ * groups mutex.
+
  * Very similarly for freeing by group, except we use free_g_list.
  *
  * This has the very interesting property of being able to run concurrently with
-- 
cgit v0.10.2


From 34e3a58c66aafd90cc16c061569fbefc3ff451e9 Mon Sep 17 00:00:00 2001
From: Libo Chen <clbchenlibo.chen@huawei.com>
Date: Mon, 8 Jul 2013 15:59:47 -0700
Subject: drivers/iommu/msm_iommu_dev.c: fix leak and clean up error paths

Fix two obvious problems:

1. We have registered msm_iommu_driver first, and need unregister it
   when registered msm_iommu_ctx_driver fail

2. We don't need to kfree drvdata before kzalloc was successful.

[akpm@linux-foundation.org: remove now-unneeded initialization of ctx_drvdata, remove unneeded braces]
Signed-off-by: Libo Chen <libo.chen@huawei.com>
Acked-by: David Brown <davidb@codeaurora.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: James Hogan <james.hogan@imgtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/iommu/msm_iommu_dev.c b/drivers/iommu/msm_iommu_dev.c
index 9144a6b..6ba3514 100644
--- a/drivers/iommu/msm_iommu_dev.c
+++ b/drivers/iommu/msm_iommu_dev.c
@@ -291,25 +291,20 @@ static int msm_iommu_ctx_probe(struct platform_device *pdev)
 {
 	struct msm_iommu_ctx_dev *c = pdev->dev.platform_data;
 	struct msm_iommu_drvdata *drvdata;
-	struct msm_iommu_ctx_drvdata *ctx_drvdata = NULL;
+	struct msm_iommu_ctx_drvdata *ctx_drvdata;
 	int i, ret;
-	if (!c || !pdev->dev.parent) {
-		ret = -EINVAL;
-		goto fail;
-	}
 
-	drvdata = dev_get_drvdata(pdev->dev.parent);
+	if (!c || !pdev->dev.parent)
+		return -EINVAL;
 
-	if (!drvdata) {
-		ret = -ENODEV;
-		goto fail;
-	}
+	drvdata = dev_get_drvdata(pdev->dev.parent);
+	if (!drvdata)
+		return -ENODEV;
 
 	ctx_drvdata = kzalloc(sizeof(*ctx_drvdata), GFP_KERNEL);
-	if (!ctx_drvdata) {
-		ret = -ENOMEM;
-		goto fail;
-	}
+	if (!ctx_drvdata)
+		return -ENOMEM;
+
 	ctx_drvdata->num = c->num;
 	ctx_drvdata->pdev = pdev;
 
@@ -403,6 +398,7 @@ static int __init msm_iommu_driver_init(void)
 
 	ret = platform_driver_register(&msm_iommu_ctx_driver);
 	if (ret != 0) {
+		platform_driver_unregister(&msm_iommu_driver);
 		pr_err("Failed to register IOMMU context driver\n");
 		goto error;
 	}
-- 
cgit v0.10.2


From 9a2458a633d4b3c9e0eae506da40cf44dc075314 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Mon, 8 Jul 2013 15:59:48 -0700
Subject: mm: mremap: validate input before taking lock

This patch is very similar to commit 84d96d897671 ("mm: madvise:
complete input validation before taking lock"): perform some basic
validation of the input to mremap() before taking the
&current->mm->mmap_sem lock.

This also makes the MREMAP_FIXED => MREMAP_MAYMOVE dependency slightly
more explicit.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/mremap.c b/mm/mremap.c
index 3708655..457d34e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -456,13 +456,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	unsigned long charged = 0;
 	bool locked = false;
 
-	down_write(&current->mm->mmap_sem);
-
 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
-		goto out;
+		return ret;
+
+	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
+		return ret;
 
 	if (addr & ~PAGE_MASK)
-		goto out;
+		return ret;
 
 	old_len = PAGE_ALIGN(old_len);
 	new_len = PAGE_ALIGN(new_len);
@@ -473,12 +474,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	 * a zero new-len is nonsensical.
 	 */
 	if (!new_len)
-		goto out;
+		return ret;
+
+	down_write(&current->mm->mmap_sem);
 
 	if (flags & MREMAP_FIXED) {
-		if (flags & MREMAP_MAYMOVE)
-			ret = mremap_to(addr, old_len, new_addr, new_len,
-					&locked);
+		ret = mremap_to(addr, old_len, new_addr, new_len,
+				&locked);
 		goto out;
 	}
 
-- 
cgit v0.10.2


From 54f72fe022d9b2c4de40043a118881121190a117 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 8 Jul 2013 15:59:49 -0700
Subject: memcg: clean up memcg->nodeinfo

Remove struct mem_cgroup_lru_info and fold its single member, the
variably sized nodeinfo[0], directly into struct mem_cgroup.  This
should make it more obvious why it has to be the last member there.

Also move the comment that's above that special last member below it, so
it is more visible to somebody that considers appending to the struct
mem_cgroup.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2e851f4..2b7cd24 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -187,10 +187,6 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
-struct mem_cgroup_lru_info {
-	struct mem_cgroup_per_node *nodeinfo[0];
-};
-
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
@@ -366,14 +362,8 @@ struct mem_cgroup {
 	atomic_t	numainfo_updating;
 #endif
 
-	/*
-	 * Per cgroup active and inactive list, similar to the
-	 * per zone LRU lists.
-	 *
-	 * WARNING: This has to be the last element of the struct. Don't
-	 * add new fields after this point.
-	 */
-	struct mem_cgroup_lru_info info;
+	struct mem_cgroup_per_node *nodeinfo[0];
+	/* WARNING: nodeinfo must be the last member here */
 };
 
 static size_t memcg_size(void)
@@ -683,7 +673,7 @@ static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 {
 	VM_BUG_ON((unsigned)nid >= nr_node_ids);
-	return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
+	return &memcg->nodeinfo[nid]->zoneinfo[zid];
 }
 
 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
@@ -6087,13 +6077,13 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
-	memcg->info.nodeinfo[node] = pn;
+	memcg->nodeinfo[node] = pn;
 	return 0;
 }
 
 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
-	kfree(memcg->info.nodeinfo[node]);
+	kfree(memcg->nodeinfo[node]);
 }
 
 static struct mem_cgroup *mem_cgroup_alloc(void)
-- 
cgit v0.10.2


From 609838cfed972d49a65aac7923a9ff5cbe482e30 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 8 Jul 2013 15:59:50 -0700
Subject: mm: invoke oom-killer from remaining unconverted page fault handlers

A few remaining architectures directly kill the page faulting task in an
out of memory situation.  This is usually not a good idea since that
task might not even use a significant amount of memory and so may not be
the optimal victim to resolve the situation.

Since 2.6.29's 1c0fe6e ("mm: invoke oom-killer from page fault") there
is a hook that architecture page fault handlers are supposed to call to
invoke the OOM killer and let it pick the right task to kill.  Convert
the remaining architectures over to this hook.

To have the previous behavior of simply taking out the faulting task the
vm.oom_kill_allocating_task sysctl can be set to 1.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Vineet Gupta <vgupta@synopsys.com>   [arch/arc bits]
Cc: James Hogan <james.hogan@imgtec.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Chen Liqin <liqin.chen@sunplusct.com>
Cc: Lennox Wu <lennox.wu@gmail.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index 318164c..0fd1f0d 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -207,8 +207,10 @@ out_of_memory:
 	}
 	up_read(&mm->mmap_sem);
 
-	if (user_mode(regs))
-		do_group_exit(SIGKILL);	/* This will never return */
+	if (user_mode(regs)) {
+		pagefault_out_of_memory();
+		return;
+	}
 
 	goto no_context;
 
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
index 2c75bf7..8fddf46 100644
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -224,8 +224,10 @@ do_sigbus:
 	 */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (user_mode(regs))
-		do_group_exit(SIGKILL);
+	if (user_mode(regs)) {
+		pagefault_out_of_memory();
+		return 1;
+	}
 
 no_context:
 	/* Are we prepared to handle this kernel fault?  */
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index d48a84f..8a2e6de 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -345,9 +345,10 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	printk(KERN_ALERT "VM: killing process %s\n", tsk->comm);
-	if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
-		do_exit(SIGKILL);
+	if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) {
+		pagefault_out_of_memory();
+		return;
+	}
 	goto no_context;
 
 do_sigbus:
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index e2bfafc..4a41f84 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -267,10 +267,10 @@ out_of_memory:
 	__asm__ __volatile__("l.nop 1");
 
 	up_read(&mm->mmap_sem);
-	printk("VM: killing process %s\n", tsk->comm);
-	if (user_mode(regs))
-		do_exit(SIGKILL);
-	goto no_context;
+	if (!user_mode(regs))
+		goto no_context;
+	pagefault_out_of_memory();
+	return;
 
 do_sigbus:
 	up_read(&mm->mmap_sem);
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c
index 47b600e..6b18fb0 100644
--- a/arch/score/mm/fault.c
+++ b/arch/score/mm/fault.c
@@ -172,10 +172,10 @@ out_of_memory:
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
-	printk("VM: killing process %s\n", tsk->comm);
-	if (user_mode(regs))
-		do_group_exit(SIGKILL);
-	goto no_context;
+	if (!user_mode(regs))
+		goto no_context;
+	pagefault_out_of_memory();
+	return;
 
 do_sigbus:
 	up_read(&mm->mmap_sem);
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 3d2b81c..f7f99f9 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -573,10 +573,10 @@ out_of_memory:
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
-	pr_alert("VM: killing process %s\n", tsk->comm);
-	if (!is_kernel_mode)
-		do_group_exit(SIGKILL);
-	goto no_context;
+	if (is_kernel_mode)
+		goto no_context;
+	pagefault_out_of_memory();
+	return 0;
 
 do_sigbus:
 	up_read(&mm->mmap_sem);
-- 
cgit v0.10.2


From 7960aedde8cfa72e4caf488806ea7ea7d2fa8dba Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 15:59:52 -0700
Subject: mm: remove duplicated call of get_pfn_range_for_nid

When calculating pages in a node, for each zone in that node, we will
have

  zone_spanned_pages_in_node
    --> get_pfn_range_for_nid
  zone_absent_pages_in_node
    --> get_pfn_range_for_nid

That is to say, we call the get_pfn_range_for_nid to get start_pfn and
end_pfn of the node for MAX_NR_ZONES * 2 times.  And this is totally
unnecessary if we call the get_pfn_range_for_nid before
zone_*_pages_in_node add two extra arguments node_start_pfn and
node_end_pfn for zone_*_pages_in_node, then we can remove the
get_pfn_range_in_node in zone_*_pages_in_node.

[akpm@linux-foundation.org: make definitions more readable]
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 327516b..7d5e40f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4421,13 +4421,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
  */
 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
 					unsigned long *ignored)
 {
-	unsigned long node_start_pfn, node_end_pfn;
 	unsigned long zone_start_pfn, zone_end_pfn;
 
-	/* Get the start and end of the node and zone */
-	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+	/* Get the start and end of the zone */
 	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
 	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
 	adjust_zone_range_for_zone_movable(nid, zone_type,
@@ -4482,14 +4482,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
 /* Return the number of page frames in holes in a zone on a node */
 static unsigned long __meminit zone_absent_pages_in_node(int nid,
 					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
 					unsigned long *ignored)
 {
 	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
 	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
-	unsigned long node_start_pfn, node_end_pfn;
 	unsigned long zone_start_pfn, zone_end_pfn;
 
-	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
 	zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
 	zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
 
@@ -4502,6 +4502,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
 					unsigned long *zones_size)
 {
 	return zones_size[zone_type];
@@ -4509,6 +4511,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
 
 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
 						unsigned long zone_type,
+						unsigned long node_start_pfn,
+						unsigned long node_end_pfn,
 						unsigned long *zholes_size)
 {
 	if (!zholes_size)
@@ -4520,21 +4524,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
+						unsigned long node_start_pfn,
+						unsigned long node_end_pfn,
+						unsigned long *zones_size,
+						unsigned long *zholes_size)
 {
 	unsigned long realtotalpages, totalpages = 0;
 	enum zone_type i;
 
 	for (i = 0; i < MAX_NR_ZONES; i++)
 		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
-								zones_size);
+							 node_start_pfn,
+							 node_end_pfn,
+							 zones_size);
 	pgdat->node_spanned_pages = totalpages;
 
 	realtotalpages = totalpages;
 	for (i = 0; i < MAX_NR_ZONES; i++)
 		realtotalpages -=
 			zone_absent_pages_in_node(pgdat->node_id, i,
-								zholes_size);
+						  node_start_pfn, node_end_pfn,
+						  zholes_size);
 	pgdat->node_present_pages = realtotalpages;
 	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
 							realtotalpages);
@@ -4643,6 +4653,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
  * NOTE: pgdat should get zeroed by caller.
  */
 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
+		unsigned long node_start_pfn, unsigned long node_end_pfn,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
 	enum zone_type j;
@@ -4664,8 +4675,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize, freesize, memmap_pages;
 
-		size = zone_spanned_pages_in_node(nid, j, zones_size);
+		size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
+						  node_end_pfn, zones_size);
 		realsize = freesize = size - zone_absent_pages_in_node(nid, j,
+								node_start_pfn,
+								node_end_pfn,
 								zholes_size);
 
 		/*
@@ -4779,6 +4793,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 		unsigned long node_start_pfn, unsigned long *zholes_size)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
+	unsigned long start_pfn = 0;
+	unsigned long end_pfn = 0;
 
 	/* pg_data_t should be reset to zero when it's allocated */
 	WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
@@ -4786,7 +4802,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
 	init_zone_allows_reclaim(nid);
-	calculate_node_totalpages(pgdat, zones_size, zholes_size);
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+#endif
+	calculate_node_totalpages(pgdat, start_pfn, end_pfn,
+				  zones_size, zholes_size);
 
 	alloc_node_mem_map(pgdat);
 #ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -4795,7 +4815,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 		(unsigned long)pgdat->node_mem_map);
 #endif
 
-	free_area_init_core(pgdat, zones_size, zholes_size);
+	free_area_init_core(pgdat, start_pfn, end_pfn,
+			    zones_size, zholes_size);
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-- 
cgit v0.10.2


From ab15d9b4cbc2b6497023f554a152c2573ca53671 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 8 Jul 2013 15:59:53 -0700
Subject: mm/vmalloc.c: unbreak __vunmap()

There is an extra semi-colon so the function always returns.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 91a1047..96b77a9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1453,7 +1453,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
 		return;
 
 	if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
-			addr));
+			addr))
 		return;
 
 	area = remove_vm_area(addr);
-- 
cgit v0.10.2


From 3fcd76e8028e0be37b02a2002b4f56755daeda06 Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 15:59:54 -0700
Subject: mm/vmalloc.c: remove dead code in vb_alloc

Space in a vmap block that was once allocated is considered dirty and
not made available for allocation again before the whole block is
recycled.  The result is that free space within a vmap block is always
contiguous.

So if a vmap block has enough free space for allocation, the allocation
is impossible to fail.  Thus, the fragmented block purging was never
invoked from vb_alloc().  So remove this dead code.

[ Same patches also sent by:

    Chanho Min <chanho.min@lge.com>
    Johannes Weiner <hannes@cmpxchg.org>

  but git doesn't do "multiple authors" ]

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 96b77a9..a35f4f5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -910,7 +910,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
 	struct vmap_block *vb;
 	unsigned long addr = 0;
 	unsigned int order;
-	int purge = 0;
 
 	BUG_ON(size & ~PAGE_MASK);
 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -934,17 +933,7 @@ again:
 		if (vb->free < 1UL << order)
 			goto next;
 
-		i = bitmap_find_free_region(vb->alloc_map,
-						VMAP_BBMAP_BITS, order);
-
-		if (i < 0) {
-			if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
-				/* fragmented and no outstanding allocations */
-				BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
-				purge = 1;
-			}
-			goto next;
-		}
+		i = VMAP_BBMAP_BITS - vb->free;
 		addr = vb->va->va_start + (i << PAGE_SHIFT);
 		BUG_ON(addr_to_vb_idx(addr) !=
 				addr_to_vb_idx(vb->va->va_start));
@@ -960,9 +949,6 @@ next:
 		spin_unlock(&vb->lock);
 	}
 
-	if (purge)
-		purge_fragmented_blocks_thiscpu();
-
 	put_cpu_var(vmap_block_queue);
 	rcu_read_unlock();
 
-- 
cgit v0.10.2


From 9da3f59fbdb57c9447ddb42681f6ab98faef353a Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 15:59:55 -0700
Subject: mm/vmalloc.c: remove unused purge_fragmented_blocks_thiscpu

This function is nowhere used now, so remove it.

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a35f4f5..99d045a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -891,11 +891,6 @@ static void purge_fragmented_blocks(int cpu)
 	}
 }
 
-static void purge_fragmented_blocks_thiscpu(void)
-{
-	purge_fragmented_blocks(smp_processor_id());
-}
-
 static void purge_fragmented_blocks_allcpus(void)
 {
 	int cpu;
-- 
cgit v0.10.2


From b8e748b6c32999f221ea4786557b8e7e6c4e4e7a Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 15:59:56 -0700
Subject: mm/vmalloc.c: remove alloc_map from vmap_block

As we have removed the dead code in the vb_alloc, it seems there is no
place to use the alloc_map.  So there is no reason to maintain the
alloc_map in vmap_block.

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 99d045a..7ac2a1f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -754,7 +754,6 @@ struct vmap_block {
 	struct vmap_area *va;
 	struct vmap_block_queue *vbq;
 	unsigned long free, dirty;
-	DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
 	DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
 	struct list_head free_list;
 	struct rcu_head rcu_head;
@@ -820,7 +819,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
 	vb->va = va;
 	vb->free = VMAP_BBMAP_BITS;
 	vb->dirty = 0;
-	bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
 	bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
 	INIT_LIST_HEAD(&vb->free_list);
 
@@ -873,7 +871,6 @@ static void purge_fragmented_blocks(int cpu)
 		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
 			vb->free = 0; /* prevent further allocs after releasing lock */
 			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
-			bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
 			bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
 			spin_lock(&vbq->lock);
 			list_del_rcu(&vb->free_list);
-- 
cgit v0.10.2


From 46c001a2753f47ffa621131baa3409e636515347 Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 15:59:57 -0700
Subject: mm/vmalloc.c: emit the failure message before return

Use goto to jump to the fail label to give a failure message before
returning NULL.  This makes the failure handling in this function
consistent.

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7ac2a1f..d81b9f7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1642,7 +1642,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 
 	addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
 	if (!addr)
-		return NULL;
+		goto fail;
 
 	/*
 	 * In this function, newly allocated vm_struct has VM_UNLIST flag.
-- 
cgit v0.10.2


From 20fc02b477c526c6a85f84e3770373778ff2f97e Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 15:59:58 -0700
Subject: mm/vmalloc.c: rename VM_UNLIST to VM_UNINITIALIZED

VM_UNLIST was used to indicate that the vm_struct is not listed in
vmlist.

But after commit 4341fa454796 ("mm, vmalloc: remove list management of
vmlist after initializing vmalloc"), the meaning of this flag changed.
It now means the vm_struct is not fully initialized.  So renaming it to
VM_UNINITIALIZED seems more reasonable.

Also change clear_vm_unlist to clear_vm_uninitialized_flag.

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index dd0a2c8..4b8a891 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -10,12 +10,12 @@
 struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
 
 /* bits in flags of vmalloc's vm_struct below */
-#define VM_IOREMAP	0x00000001	/* ioremap() and friends */
-#define VM_ALLOC	0x00000002	/* vmalloc() */
-#define VM_MAP		0x00000004	/* vmap()ed pages */
-#define VM_USERMAP	0x00000008	/* suitable for remap_vmalloc_range */
-#define VM_VPAGES	0x00000010	/* buffer for pages was vmalloc'ed */
-#define VM_UNLIST	0x00000020	/* vm_struct is not listed in vmlist */
+#define VM_IOREMAP		0x00000001	/* ioremap() and friends */
+#define VM_ALLOC		0x00000002	/* vmalloc() */
+#define VM_MAP			0x00000004	/* vmap()ed pages */
+#define VM_USERMAP		0x00000008	/* suitable for remap_vmalloc_range */
+#define VM_VPAGES		0x00000010	/* buffer for pages was vmalloc'ed */
+#define VM_UNINITIALIZED	0x00000020	/* vm_struct is not fully initialized */
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d81b9f7..af40068 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1289,15 +1289,15 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
 	spin_unlock(&vmap_area_lock);
 }
 
-static void clear_vm_unlist(struct vm_struct *vm)
+static void clear_vm_uninitialized_flag(struct vm_struct *vm)
 {
 	/*
-	 * Before removing VM_UNLIST,
+	 * Before removing VM_UNINITIALIZED,
 	 * we should make sure that vm has proper values.
 	 * Pair with smp_rmb() in show_numa_info().
 	 */
 	smp_wmb();
-	vm->flags &= ~VM_UNLIST;
+	vm->flags &= ~VM_UNINITIALIZED;
 }
 
 static struct vm_struct *__get_vm_area_node(unsigned long size,
@@ -1635,7 +1635,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
 		goto fail;
 
-	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
+	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED,
 				  start, end, node, gfp_mask, caller);
 	if (!area)
 		goto fail;
@@ -1645,11 +1645,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 		goto fail;
 
 	/*
-	 * In this function, newly allocated vm_struct has VM_UNLIST flag.
-	 * It means that vm_struct is not fully initialized.
+	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
+	 * flag. It means that vm_struct is not fully initialized.
 	 * Now, it is fully initialized, so remove this flag here.
 	 */
-	clear_vm_unlist(area);
+	clear_vm_uninitialized_flag(area);
 
 	/*
 	 * A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2569,9 +2569,9 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
 		if (!counters)
 			return;
 
-		/* Pair with smp_wmb() in clear_vm_unlist() */
+		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
 		smp_rmb();
-		if (v->flags & VM_UNLIST)
+		if (v->flags & VM_UNINITIALIZED)
 			return;
 
 		memset(counters, 0, nr_node_ids * sizeof(unsigned int));
-- 
cgit v0.10.2


From d157a55815ffff48caec311dfb543ce8a79e283e Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 15:59:59 -0700
Subject: mm/vmalloc.c: check VM_UNINITIALIZED flag in s_show instead of
 show_numa_info

We should check the VM_UNITIALIZED flag in s_show().  If this flag is
set, that said, the vm_struct is not fully initialized.  So it is
unnecessary to try to show the information contained in vm_struct.

We checked this flag in show_numa_info(), but I think it's better to
check it earlier.

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af40068..318c500 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2569,11 +2569,6 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
 		if (!counters)
 			return;
 
-		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
-		smp_rmb();
-		if (v->flags & VM_UNINITIALIZED)
-			return;
-
 		memset(counters, 0, nr_node_ids * sizeof(unsigned int));
 
 		for (nr = 0; nr < v->nr_pages; nr++)
@@ -2602,6 +2597,11 @@ static int s_show(struct seq_file *m, void *p)
 
 	v = va->vm;
 
+	/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+	smp_rmb();
+	if (v->flags & VM_UNINITIALIZED)
+		return 0;
+
 	seq_printf(m, "0x%pK-0x%pK %7ld",
 		v->addr, v->addr + v->size, v->size);
 
-- 
cgit v0.10.2


From 6d42c232bd1e77288b2660153299b7d12a5c8e15 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@gmail.com>
Date: Mon, 8 Jul 2013 16:00:00 -0700
Subject: memcg: also test for skip accounting at the page allocation level

The memory we used to hold the memcg arrays is currently accounted to
the current memcg.  But that creates a problem, because that memory can
only be freed after the last user is gone.  Our only way to know which
is the last user, is to hook up to freeing time, but the fact that we
still have some in flight kmallocs will prevent freeing to happen.  I
believe therefore to be just easier to account this memory as global
overhead.

This patch (of 2):

Disabling accounting is only relevant for some specific memcg internal
allocations.  Therefore we would initially not have such check at
memcg_kmem_newpage_charge, since direct calls to the page allocator that
are marked with GFP_KMEMCG only happen outside memcg core.  We are
mostly concerned with cache allocations and by having this test at
memcg_kmem_get_cache we are already able to relay the allocation to the
root cache and bypass the memcg caches altogether.

There is one exception, though: the SLUB allocator does not create large
order caches, but rather service large kmallocs directly from the page
allocator.  Therefore, the following sequence, when backed by the SLUB
allocator:

	memcg_stop_kmem_account();
	kmalloc(<large_number>)
	memcg_resume_kmem_account();

would effectively ignore the fact that we should skip accounting, since
it will drive us directly to this function without passing through the
cache selector memcg_kmem_get_cache.  Such large allocations are
extremely rare but can happen, for instance, for the cache arrays.

This was never a problem in practice, because we weren't skipping
accounting for the cache arrays.  All the allocations we were skipping
were fairly small.  However, the fact that we were not skipping those
allocations are a problem and can prevent the memcgs from going away.
As we fix that, we need to make sure that the fix will also work with
the SLUB allocator.

Signed-off-by: Glauber Costa <glommer@openvz.org>
Reported-by: Michal Hocko <mhocko@suze.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2b7cd24..06a595f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3637,6 +3637,34 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
 	int ret;
 
 	*_memcg = NULL;
+
+	/*
+	 * Disabling accounting is only relevant for some specific memcg
+	 * internal allocations. Therefore we would initially not have such
+	 * check here, since direct calls to the page allocator that are marked
+	 * with GFP_KMEMCG only happen outside memcg core. We are mostly
+	 * concerned with cache allocations, and by having this test at
+	 * memcg_kmem_get_cache, we are already able to relay the allocation to
+	 * the root cache and bypass the memcg cache altogether.
+	 *
+	 * There is one exception, though: the SLUB allocator does not create
+	 * large order caches, but rather service large kmallocs directly from
+	 * the page allocator. Therefore, the following sequence when backed by
+	 * the SLUB allocator:
+	 *
+	 * 	memcg_stop_kmem_account();
+	 * 	kmalloc(<large_number>)
+	 * 	memcg_resume_kmem_account();
+	 *
+	 * would effectively ignore the fact that we should skip accounting,
+	 * since it will drive us directly to this function without passing
+	 * through the cache selector memcg_kmem_get_cache. Such large
+	 * allocations are extremely rare but can happen, for instance, for the
+	 * cache arrays. We bring this test here.
+	 */
+	if (!current->mm || current->memcg_kmem_skip_account)
+		return true;
+
 	memcg = try_get_mem_cgroup_from_mm(current->mm);
 
 	/*
-- 
cgit v0.10.2


From 425c598d583883c33c75780225ba8e0794b43bd9 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@gmail.com>
Date: Mon, 8 Jul 2013 16:00:01 -0700
Subject: memcg: do not account memory used for cache creation

The memory we used to hold the memcg arrays is currently accounted to
the current memcg.  But that creates a problem, because that memory can
only be freed after the last user is gone.  Our only way to know which
is the last user, is to hook up to freeing time, but the fact that we
still have some in flight kmallocs will prevent freeing to happen.  I
believe therefore to be just easier to account this memory as global
overhead.

Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 06a595f..64f7265 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5232,7 +5232,9 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 	static_key_slow_inc(&memcg_kmem_enabled_key);
 
 	mutex_lock(&set_limit_mutex);
+	memcg_stop_kmem_account();
 	ret = memcg_update_cache_sizes(memcg);
+	memcg_resume_kmem_account();
 	mutex_unlock(&set_limit_mutex);
 out:
 	return ret;
-- 
cgit v0.10.2


From 537926caedb335b198eb53930ebeeb6426a541f9 Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 16:00:02 -0700
Subject: include/linux/gfp.h: fix the comment for GFP_ZONE_TABLE

0xc just means MOVABLE + DMA32, which results in zone DMA32.

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0f615eb..9b4dd49 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -209,7 +209,7 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags)
  *       0x9    => DMA or NORMAL (MOVABLE+DMA)
  *       0xa    => MOVABLE (Movable is valid only if HIGHMEM is set too)
  *       0xb    => BAD (MOVABLE+HIGHMEM+DMA)
- *       0xc    => DMA32 (MOVABLE+HIGHMEM+DMA32)
+ *       0xc    => DMA32 (MOVABLE+DMA32)
  *       0xd    => BAD (MOVABLE+DMA32+DMA)
  *       0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
  *       0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
-- 
cgit v0.10.2


From 0cf31ec10e92253e2908cd830145a71043740d77 Mon Sep 17 00:00:00 2001
From: Seth Jennings <sjenning@linux.vnet.ibm.com>
Date: Mon, 8 Jul 2013 16:00:05 -0700
Subject: MAINTAINERS: add zswap and zbud maintainer

Add maintainer information for zswap and zbud into the MAINTAINERS file.

Signed-off-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index 97762ad..70cf679 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9263,6 +9263,13 @@ F:	Documentation/networking/z8530drv.txt
 F:	drivers/net/hamradio/*scc.c
 F:	drivers/net/hamradio/z8530.h
 
+ZBUD COMPRESSED PAGE ALLOCATOR
+M:	Seth Jennings <sjenning@linux.vnet.ibm.com>
+L:	linux-mm@kvack.org
+S:	Maintained
+F:	mm/zbud.c
+F:	include/linux/zbud.h
+
 ZD1211RW WIRELESS DRIVER
 M:	Daniel Drake <dsd@gentoo.org>
 M:	Ulrich Kunitz <kune@deine-taler.de>
@@ -9285,6 +9292,12 @@ M:	"Maciej W. Rozycki" <macro@linux-mips.org>
 S:	Maintained
 F:	drivers/tty/serial/zs.*
 
+ZSWAP COMPRESSED SWAP CACHING
+M:	Seth Jennings <sjenning@linux.vnet.ibm.com>
+L:	linux-mm@kvack.org
+S:	Maintained
+F:	mm/zswap.c
+
 THE REST
 M:	Linus Torvalds <torvalds@linux-foundation.org>
 L:	linux-kernel@vger.kernel.org
-- 
cgit v0.10.2


From bc732f1d55cf41627ee4c64078812b2fa592b394 Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 16:00:06 -0700
Subject: mm/page_alloc.c: remove zone_type argument of build_zonelists_node

The callers of build_zonelists_node always pass MAX_NR_ZONES -1 as the
zone_type argument, so we can directly use the value in
build_zonelists_node and remove zone_type argument.

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7d5e40f..27f9d4b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3153,12 +3153,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
  * Add all populated zones of a node to the zonelist.
  */
 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
-				int nr_zones, enum zone_type zone_type)
+				int nr_zones)
 {
 	struct zone *zone;
-
-	BUG_ON(zone_type >= MAX_NR_ZONES);
-	zone_type++;
+	enum zone_type zone_type = MAX_NR_ZONES;
 
 	do {
 		zone_type--;
@@ -3168,8 +3166,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
 				&zonelist->_zonerefs[nr_zones++]);
 			check_highest_zone(zone_type);
 		}
-
 	} while (zone_type);
+
 	return nr_zones;
 }
 
@@ -3363,8 +3361,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
 	zonelist = &pgdat->node_zonelists[0];
 	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
 		;
-	j = build_zonelists_node(NODE_DATA(node), zonelist, j,
-							MAX_NR_ZONES - 1);
+	j = build_zonelists_node(NODE_DATA(node), zonelist, j);
 	zonelist->_zonerefs[j].zone = NULL;
 	zonelist->_zonerefs[j].zone_idx = 0;
 }
@@ -3378,7 +3375,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
 	struct zonelist *zonelist;
 
 	zonelist = &pgdat->node_zonelists[1];
-	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
+	j = build_zonelists_node(pgdat, zonelist, 0);
 	zonelist->_zonerefs[j].zone = NULL;
 	zonelist->_zonerefs[j].zone_idx = 0;
 }
@@ -3586,7 +3583,7 @@ static void build_zonelists(pg_data_t *pgdat)
 	local_node = pgdat->node_id;
 
 	zonelist = &pgdat->node_zonelists[0];
-	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
+	j = build_zonelists_node(pgdat, zonelist, 0);
 
 	/*
 	 * Now we build the zonelist so that it contains the zones
@@ -3599,14 +3596,12 @@ static void build_zonelists(pg_data_t *pgdat)
 	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
 		if (!node_online(node))
 			continue;
-		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
-							MAX_NR_ZONES - 1);
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j);
 	}
 	for (node = 0; node < local_node; node++) {
 		if (!node_online(node))
 			continue;
-		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
-							MAX_NR_ZONES - 1);
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j);
 	}
 
 	zonelist->_zonerefs[j].zone = NULL;
-- 
cgit v0.10.2


From b21fbccd4b8aba805cbc231998ec7bf83616a79e Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 16:00:07 -0700
Subject: mm: remove unused functions is_{normal_idx, normal, dma32, dma}

These functions are nowhere used, so remove them.

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ae19af5..af4a3b7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -869,11 +869,6 @@ static inline int is_highmem_idx(enum zone_type idx)
 #endif
 }
 
-static inline int is_normal_idx(enum zone_type idx)
-{
-	return (idx == ZONE_NORMAL);
-}
-
 /**
  * is_highmem - helper function to quickly check if a struct zone is a 
  *              highmem zone or not.  This is an attempt to keep references
@@ -892,29 +887,6 @@ static inline int is_highmem(struct zone *zone)
 #endif
 }
 
-static inline int is_normal(struct zone *zone)
-{
-	return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
-}
-
-static inline int is_dma32(struct zone *zone)
-{
-#ifdef CONFIG_ZONE_DMA32
-	return zone == zone->zone_pgdat->node_zones + ZONE_DMA32;
-#else
-	return 0;
-#endif
-}
-
-static inline int is_dma(struct zone *zone)
-{
-#ifdef CONFIG_ZONE_DMA
-	return zone == zone->zone_pgdat->node_zones + ZONE_DMA;
-#else
-	return 0;
-#endif
-}
-
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
-- 
cgit v0.10.2


From 345606d42971fc4ed164fbabac118708d51b8e0a Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 16:00:08 -0700
Subject: mm/page_alloc.c: remove unlikely() from the current_order test

In __rmqueue_fallback(), current_order loops down from MAX_ORDER - 1 to
the order passed.  MAX_ORDER is typically 11 and pageblock_order is
typically 9 on x86.  Integer division truncates, so pageblock_order / 2
is 4.  For the first eight iterations, it's guaranteed that
current_order >= pageblock_order / 2 if it even gets that far!

So just remove the unlikely(), it's completely bogus.

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Suggested-by: David Rientjes <rientjes@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 27f9d4b..b5855e5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1046,7 +1046,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 			 * MIGRATE_CMA areas.
 			 */
 			if (!is_migrate_cma(migratetype) &&
-			    (unlikely(current_order >= pageblock_order / 2) ||
+			    (current_order >= pageblock_order / 2 ||
 			     start_migratetype == MIGRATE_RECLAIMABLE ||
 			     page_group_by_mobility_disabled)) {
 				int pages;
-- 
cgit v0.10.2


From 59d3132f8abdc18301898febf205d00db5f0458c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:00:08 -0700
Subject: vfree: don't schedule free_work() if llist_add() returns false

vfree() only needs schedule_work(&p->wq) if p->list was empty, otherwise
vfree_deferred->wq is already pending or it is running and didn't do
llist_del_all() yet.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 318c500..a649186 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1477,7 +1477,6 @@ static void __vunmap(const void *addr, int deallocate_pages)
  *	conventions for vfree() arch-depenedent would be a really bad idea)
  *
  *	NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
- *	
  */
 void vfree(const void *addr)
 {
@@ -1489,8 +1488,8 @@ void vfree(const void *addr)
 		return;
 	if (unlikely(in_interrupt())) {
 		struct vfree_deferred *p = &__get_cpu_var(vfree_deferred);
-		llist_add((struct llist_node *)addr, &p->list);
-		schedule_work(&p->wq);
+		if (llist_add((struct llist_node *)addr, &p->list))
+			schedule_work(&p->wq);
 	} else
 		__vunmap(addr, 1);
 }
-- 
cgit v0.10.2


From 929aaf56958ab2300919653b923413af695470a5 Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 16:00:09 -0700
Subject: mm: remove unused __put_page()

This function is nowhere used, and it has a confusing name with put_page
in mm/swap.c.  So better to remove it.

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/internal.h b/mm/internal.h
index 8562de0..4390ac6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -32,11 +32,6 @@ static inline void set_page_refcounted(struct page *page)
 	set_page_count(page, 1);
 }
 
-static inline void __put_page(struct page *page)
-{
-	atomic_dec(&page->_count);
-}
-
 static inline void __get_page_tail_foll(struct page *page,
 					bool get_page_head)
 {
-- 
cgit v0.10.2


From f3deb6872b946a851a3799b315f3c85ce4c027fc Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 16:00:10 -0700
Subject: mm/sparse.c: put clear_hwpoisoned_pages within
 CONFIG_MEMORY_HOTREMOVE

With CONFIG_MEMORY_HOTREMOVE unset, there is a compile warning:

  mm/sparse.c:755: warning: `clear_hwpoisoned_pages' defined but not used

And Bisecting it ended up pointing to 4edd7ceff ("mm, hotplug: avoid
compiling memory hotremove functions when disabled").

This is because the commit above put sparse_remove_one_section() within
the protection of CONFIG_MEMORY_HOTREMOVE but the only user of
clear_hwpoisoned_pages() is sparse_remove_one_section(), and it is not
within the protection of CONFIG_MEMORY_HOTREMOVE.

So put clear_hwpoisoned_pages within CONFIG_MEMORY_HOTREMOVE should fix
the warning.

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/sparse.c b/mm/sparse.c
index b38400f..308d503 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -753,6 +753,7 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
 #ifdef CONFIG_MEMORY_FAILURE
 static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 {
@@ -774,7 +775,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static void free_section_usemap(struct page *memmap, unsigned long *usemap)
 {
 	struct page *usemap_page;
-- 
cgit v0.10.2


From 12057841008534236e52df3d3e63e089f27c5406 Mon Sep 17 00:00:00 2001
From: Haicheng Li <haicheng.li@linux.intel.com>
Date: Mon, 8 Jul 2013 16:00:11 -0700
Subject: fs/fs-writeback.c: : make wb_do_writeback() as static

It's not used globally and could be static.

Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a85ac4e..aca8835 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -963,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 /*
  * Retrieve work items and do the writeback they describe
  */
-long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+static long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 {
 	struct backing_dev_info *bdi = wb->bdi;
 	struct wb_writeback_work *work;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index abfe117..e0efffa 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -95,7 +95,6 @@ int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 void sync_inodes_sb(struct super_block *);
 long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 				enum wb_reason reason);
-long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
 
-- 
cgit v0.10.2


From 6ce1bc86ae8b8f74095f2694732ccbab2f3849e5 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Date: Mon, 8 Jul 2013 16:00:12 -0700
Subject: mm/writeback: remove wb_reason_name

wb_reason_name is not used any more - remove it.

Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index e0efffa..e1703de 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -51,7 +51,6 @@ enum wb_reason {
 
 	WB_REASON_MAX,
 };
-extern const char *wb_reason_name[];
 
 /*
  * A control structure which tells the writeback code what to do.  These are
-- 
cgit v0.10.2


From 25d130ba22362757a90135fd8a0f75cc7fc71e79 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Date: Mon, 8 Jul 2013 16:00:14 -0700
Subject: mm/writeback: don't check force_wait to handle bdi->work_list

After commit 839a8e8660b6 ("writeback: replace custom worker pool
implementation with unbound workqueue"), bdi_writeback_workfn runs off
bdi_writeback->dwork, on each execution, it processes bdi->work_list and
reschedules if there are more things to do instead of flush any work
that race with us existing.  It is unecessary to check force_wait in
wb_do_writeback since it is always 0 after the mentioned commit.  This
patch remove the force_wait in wb_do_writeback.

Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index aca8835..68851ff 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -963,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 /*
  * Retrieve work items and do the writeback they describe
  */
-static long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+static long wb_do_writeback(struct bdi_writeback *wb)
 {
 	struct backing_dev_info *bdi = wb->bdi;
 	struct wb_writeback_work *work;
@@ -971,12 +971,6 @@ static long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 	set_bit(BDI_writeback_running, &wb->bdi->state);
 	while ((work = get_next_work_item(bdi)) != NULL) {
-		/*
-		 * Override sync mode, in case we must wait for completion
-		 * because this thread is exiting now.
-		 */
-		if (force_wait)
-			work->sync_mode = WB_SYNC_ALL;
 
 		trace_writeback_exec(bdi, work);
 
@@ -1025,7 +1019,7 @@ void bdi_writeback_workfn(struct work_struct *work)
 		 * rescuer as work_list needs to be drained.
 		 */
 		do {
-			pages_written = wb_do_writeback(wb, 0);
+			pages_written = wb_do_writeback(wb);
 			trace_writeback_pages_written(pages_written);
 		} while (!list_empty(&bdi->work_list));
 	} else {
-- 
cgit v0.10.2


From fc6df808aaf00eed564e2e7fc0f246691363cd12 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Date: Mon, 8 Jul 2013 16:00:15 -0700
Subject: mm/writeback: commit reason of WB_REASON_FORKER_THREAD mismatch name

After commit 839a8e8660b6 ("writeback: replace custom worker pool
implementation with unbound workqueue"), there is no bdi forker thread
any more.  However, WB_REASON_FORKER_THREAD is still used due to it is
TPs userland visible and we won't be exposing exactly the same
information with just a different name.

Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index e1703de..4e198ca 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -47,6 +47,12 @@ enum wb_reason {
 	WB_REASON_LAPTOP_TIMER,
 	WB_REASON_FREE_MORE_MEM,
 	WB_REASON_FS_FREE_SPACE,
+	/*
+	 * There is no bdi forker thread any more and works are done
+	 * by emergency worker, however, this is TPs userland visible
+	 * and we'll be exposing exactly the same information,
+	 * so it has a mismatch name.
+	 */
 	WB_REASON_FORKER_THREAD,
 
 	WB_REASON_MAX,
-- 
cgit v0.10.2


From f8f191f1addf0b31f188fd88e71e97200871c99c Mon Sep 17 00:00:00 2001
From: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Date: Mon, 8 Jul 2013 16:00:16 -0700
Subject: mm/page_alloc: fix doc for numa_zonelist_order

The default zonelist order selecter will select "node" order if any nodes
DMA zone comprises greater than 70% of its local memory instead of 60%,
according to default_zonelist_order::low_kmem_size > total * 70/100.

Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index dcc75a9..36ecc26 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -510,7 +510,7 @@ Specify "[Dd]efault" to request automatic configuration.  Autoconfiguration
 will select "node" order in following case.
 (1) if the DMA zone does not exist or
 (2) if the DMA zone comprises greater than 50% of the available memory or
-(3) if any node's DMA zone comprises greater than 60% of its local memory and
+(3) if any node's DMA zone comprises greater than 70% of its local memory and
     the amount of local memory is big enough.
 
 Otherwise, "zone" order will be selected. Default order is recommended unless
-- 
cgit v0.10.2


From f49cbdde495f62e1c2d906b16e833cec27de5e59 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Date: Mon, 8 Jul 2013 16:00:16 -0700
Subject: mm/thp: fix doc for transparent huge zero page

Transparent huge zero page is used during the page fault instead of in
khugepaged.

  # ls /sys/kernel/mm/transparent_hugepage/
  defrag  enabled  khugepaged  use_zero_page
  # ls /sys/kernel/mm/transparent_hugepage/khugepaged/
  alloc_sleep_millisecs  defrag  full_scans  max_ptes_none  pages_collapsed  pages_to_scan  scan_sleep_millisecs

This patch corrects the documentation just like the codes done.

Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 8785fb8..4a63953 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -120,8 +120,8 @@ By default kernel tries to use huge zero page on read page fault.
 It's possible to disable huge zero page by writing 0 or enable it
 back by writing 1:
 
-echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page
-echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page
+echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
 
 khugepaged will be automatically started when
 transparent_hugepage/enabled is set to "always" or "madvise, and it'll
-- 
cgit v0.10.2


From 73b44ff43c4b3cf517826da03c51948593f88753 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Date: Mon, 8 Jul 2013 16:00:17 -0700
Subject: mm/pgtable: don't accumulate addr during pgd prepopulate pmd

The old codes accumulate addr to get right pmd, however, currently pmds
are preallocated and transfered as a parameter, there is unnecessary to
accumulate addr variable any more, this patch remove it.

Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17fda6a..dfa537a 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -240,7 +240,6 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 {
 	pud_t *pud;
-	unsigned long addr;
 	int i;
 
 	if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
@@ -248,8 +247,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 
 	pud = pud_offset(pgd, 0);
 
- 	for (addr = i = 0; i < PREALLOCATED_PMDS;
-	     i++, pud++, addr += PUD_SIZE) {
+	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
 		pmd_t *pmd = pmds[i];
 
 		if (i >= KERNEL_PGD_BOUNDARY)
-- 
cgit v0.10.2


From 64363aad5ff1b878230e91223038c26a2205bff3 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 8 Jul 2013 16:00:18 -0700
Subject: mm: remove unused VM_<READfoo> macros and expand other in-place

These VM_<READfoo> macros aren't used very often and three of them
aren't used at all.

Expand the ones that are used in-place, and remove all the now unused
#define VM_<foo> macros.

VM_READHINTMASK, VM_NormalReadHint and VM_ClearReadHint were added just
before 2.4 and appears have never been used.

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b87681a..f022460 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -151,12 +151,6 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_STACK_FLAGS	(VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
 #endif
 
-#define VM_READHINTMASK			(VM_SEQ_READ | VM_RAND_READ)
-#define VM_ClearReadHint(v)		(v)->vm_flags &= ~VM_READHINTMASK
-#define VM_NormalReadHint(v)		(!((v)->vm_flags & VM_READHINTMASK))
-#define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
-#define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
-
 /*
  * Special vmas that are non-mergable, non-mlock()able.
  * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
diff --git a/mm/filemap.c b/mm/filemap.c
index 7905fe7..4b51ac1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1539,12 +1539,12 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
 	struct address_space *mapping = file->f_mapping;
 
 	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(vma))
+	if (vma->vm_flags & VM_RAND_READ)
 		return;
 	if (!ra->ra_pages)
 		return;
 
-	if (VM_SequentialReadHint(vma)) {
+	if (vma->vm_flags & VM_SEQ_READ) {
 		page_cache_sync_readahead(mapping, ra, file, offset,
 					  ra->ra_pages);
 		return;
@@ -1584,7 +1584,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
 	struct address_space *mapping = file->f_mapping;
 
 	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(vma))
+	if (vma->vm_flags & VM_RAND_READ)
 		return;
 	if (ra->mmap_miss > 0)
 		ra->mmap_miss--;
diff --git a/mm/memory.c b/mm/memory.c
index b68812d..1ce2e2a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1150,7 +1150,7 @@ again:
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
 				if (pte_young(ptent) &&
-				    likely(!VM_SequentialReadHint(vma)))
+				    likely(!(vma->vm_flags & VM_SEQ_READ)))
 					mark_page_accessed(page);
 				rss[MM_FILEPAGES]--;
 			}
diff --git a/mm/rmap.c b/mm/rmap.c
index e22ceeb..cd356df 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -720,7 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 			 * mapping is already gone, the unmap path will have
 			 * set PG_referenced or activated the page.
 			 */
-			if (likely(!VM_SequentialReadHint(vma)))
+			if (likely(!(vma->vm_flags & VM_SEQ_READ)))
 				referenced++;
 		}
 		pte_unmap_unlock(pte, ptl);
-- 
cgit v0.10.2


From bcb615a81b1765864c71c50afb56631e7a1e5283 Mon Sep 17 00:00:00 2001
From: Zhang Yanfei <zhangyanfei.yes@gmail.com>
Date: Mon, 8 Jul 2013 16:00:19 -0700
Subject: mm/vmalloc.c: fix an overflow bug in alloc_vmap_area()

When searching a vmap area in the vmalloc space, we use (addr + size -
1) to check if the value is less than addr, which is an overflow.  But
we assign (addr + size) to vmap_area->va_end.

So if we come across the below case:

  (addr + size - 1) : not overflow
  (addr + size)     : overflow

we will assign an overflow value (e.g 0) to vmap_area->va_end, And this
will trigger BUG in __insert_vmap_area, causing system panic.

So using (addr + size) to check the overflow should be the correct
behaviour, not (addr + size - 1).

Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Reported-by: Ghennadi Procopciuc <unix140@gmail.com>
Tested-by: Daniel Baluta <dbaluta@ixiacom.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a649186..13a5495 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -388,12 +388,12 @@ nocache:
 		addr = ALIGN(first->va_end, align);
 		if (addr < vstart)
 			goto nocache;
-		if (addr + size - 1 < addr)
+		if (addr + size < addr)
 			goto overflow;
 
 	} else {
 		addr = ALIGN(vstart, align);
-		if (addr + size - 1 < addr)
+		if (addr + size < addr)
 			goto overflow;
 
 		n = vmap_area_root.rb_node;
@@ -420,7 +420,7 @@ nocache:
 		if (addr + cached_hole_size < first->va_start)
 			cached_hole_size = first->va_start - addr;
 		addr = ALIGN(first->va_end, align);
-		if (addr + size - 1 < addr)
+		if (addr + size < addr)
 			goto overflow;
 
 		if (list_is_last(&first->list, &vmap_area_list))
-- 
cgit v0.10.2


From ef277c73ca3b1aade278036ae11640090681d558 Mon Sep 17 00:00:00 2001
From: Tang Chen <tangchen@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 16:00:21 -0700
Subject: page migration: fix wrong comment in
 address_space_operations.migratepage()

There is no parameter "sync" in address_space_operations->migratepage().
It should be migrate_mode.  And the comment is for MIGRATE_ASYNC.

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 99be011..cb771ec 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -372,8 +372,8 @@ struct address_space_operations {
 	int (*get_xip_mem)(struct address_space *, pgoff_t, int,
 						void **, unsigned long *);
 	/*
-	 * migrate the contents of a page to the specified target. If sync
-	 * is false, it must not block.
+	 * migrate the contents of a page to the specified target. If
+	 * migrate_mode is MIGRATE_ASYNC, it must not block.
 	 */
 	int (*migratepage) (struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-- 
cgit v0.10.2


From d8bbdd773d64b30b6b36f027ad2e182ed2045f3c Mon Sep 17 00:00:00 2001
From: Tang Chen <tangchen@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 16:00:22 -0700
Subject: mm/memblock.c: fix wrong comment in __next_free_mem_range()

Remove one redundant "nid" in the comment.

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memblock.c b/mm/memblock.c
index c5fad93..a847bfe6 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -566,7 +566,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
 /**
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %MAX_NUMNODES for all nodes
  * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @out_nid: ptr to int for nid of the range, can be %NULL
-- 
cgit v0.10.2


From 7e9f5eb03d3762ec89dda1888c774ae7b4040af7 Mon Sep 17 00:00:00 2001
From: Tang Chen <tangchen@cn.fujitsu.com>
Date: Mon, 8 Jul 2013 16:00:23 -0700
Subject: mm/memory_hotplug.c: fix a comment typo in
 register_page_bootmem_info_node()

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f5ba127..cd2990f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -208,13 +208,13 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 	pfn = pgdat->node_start_pfn;
 	end_pfn = pgdat_end_pfn(pgdat);
 
-	/* register_section info */
+	/* register section info */
 	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
 		/*
 		 * Some platforms can assign the same pfn to multiple nodes - on
 		 * node0 as well as nodeN.  To avoid registering a pfn against
 		 * multiple nodes we check that this pfn does not already
-		 * reside in some other node.
+		 * reside in some other nodes.
 		 */
 		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
 			register_page_bootmem_info_section(pfn);
-- 
cgit v0.10.2


From 5a1c9cbc1550f93335d7c03eb6c271e642deff04 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Jul 2013 16:00:24 -0700
Subject: mm: vmscan: do not continue scanning if reclaim was aborted for
 compaction

Direct reclaim is not aborting to allow compaction to go ahead properly.
do_try_to_free_pages is told to abort reclaim which is happily ignores
and instead increases priority instead until it reaches 0 and starts
shrinking file/anon equally.  This patch corrects the situation by
aborting reclaim when requested instead of raising priority.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 99b3ac7..2385663 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2361,8 +2361,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		aborted_reclaim = shrink_zones(zonelist, sc);
 
 		/*
-		 * Don't shrink slabs when reclaiming memory from
-		 * over limit cgroups
+		 * Don't shrink slabs when reclaiming memory from over limit
+		 * cgroups but do shrink slab at least once when aborting
+		 * reclaim for compaction to avoid unevenly scanning file/anon
+		 * LRU pages over slab pages.
 		 */
 		if (global_reclaim(sc)) {
 			unsigned long lru_pages = 0;
@@ -2404,7 +2406,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 						WB_REASON_TRY_TO_FREE_PAGES);
 			sc->may_writepage = 1;
 		}
-	} while (--sc->priority >= 0);
+	} while (--sc->priority >= 0 && !aborted_reclaim);
 
 out:
 	delayacct_freepages_end();
-- 
cgit v0.10.2


From 918fc718c5922520c499ad60f61b8df86b998ae9 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Jul 2013 16:00:25 -0700
Subject: mm: vmscan: do not scale writeback pages when deciding whether to set
 ZONE_WRITEBACK

After the patch "mm: vmscan: Flatten kswapd priority loop" was merged
the scanning priority of kswapd changed.

The priority now rises until it is scanning enough pages to meet the
high watermark.  shrink_inactive_list sets ZONE_WRITEBACK if a number of
pages were encountered under writeback but this value is scaled based on
the priority.  As kswapd frequently scans with a higher priority now it
is relatively easy to set ZONE_WRITEBACK.  This patch removes the
scaling and treates writeback pages similar to how it treats unqueued
dirty pages and congested pages.  The user-visible effect should be that
kswapd will writeback fewer pages from reclaim context.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2385663..2cff0d4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1443,25 +1443,11 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * as there is no guarantee the dirtying process is throttled in the
 	 * same way balance_dirty_pages() manages.
 	 *
-	 * This scales the number of dirty pages that must be under writeback
-	 * before a zone gets flagged ZONE_WRITEBACK. It is a simple backoff
-	 * function that has the most effect in the range DEF_PRIORITY to
-	 * DEF_PRIORITY-2 which is the priority reclaim is considered to be
-	 * in trouble and reclaim is considered to be in trouble.
-	 *
-	 * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
-	 * DEF_PRIORITY-1  50% must be PageWriteback
-	 * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
-	 * ...
-	 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
-	 *                     isolated page is PageWriteback
-	 *
 	 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
 	 * of pages under pages flagged for immediate reclaim and stall if any
 	 * are encountered in the nr_immediate check below.
 	 */
-	if (nr_writeback && nr_writeback >=
-			(nr_taken >> (DEF_PRIORITY - sc->priority)))
+	if (nr_writeback && nr_writeback == nr_taken)
 		zone_set_flag(zone, ZONE_WRITEBACK);
 
 	/*
-- 
cgit v0.10.2


From 493af578040e690f93f0fc8d9e7667ffff8155bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rn=20Engel?= <joern@logfs.org>
Date: Mon, 8 Jul 2013 16:00:26 -0700
Subject: mmap: allow MAP_HUGETLB for hugetlbfs files v2

It is counterintuitive at best that mmap'ing a hugetlbfs file with
MAP_HUGETLB fails, while mmap'ing it without will a) succeed and b)
return huge pages.

v2: use is_file_hugepages(), as suggested by Jianguo

Signed-off-by: Joern Engel <joern@logfs.org>
Cc: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/mmap.c b/mm/mmap.c
index 8468ffd..0718c17 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1358,13 +1358,14 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
 	if (!(flags & MAP_ANONYMOUS)) {
 		audit_mmap_fd(fd, flags);
-		if (unlikely(flags & MAP_HUGETLB))
-			return -EINVAL;
 		file = fget(fd);
 		if (!file)
 			goto out;
 		if (is_file_hugepages(file))
 			len = ALIGN(len, huge_page_size(hstate_file(file)));
+		retval = -EINVAL;
+		if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
+			goto out_fput;
 	} else if (flags & MAP_HUGETLB) {
 		struct user_struct *user = NULL;
 		struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) &
@@ -1391,6 +1392,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
 	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+out_fput:
 	if (file)
 		fput(file);
 out:
-- 
cgit v0.10.2


From fa460c2d37870e0a6f94c70e8b76d05ca11b6db0 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Mon, 8 Jul 2013 16:00:27 -0700
Subject: Revert "memcg: avoid dangling reference count in creation failure"

This reverts commit e4715f01be697a.

mem_cgroup_put is hierarchy aware so mem_cgroup_put(memcg) already drops
an additional reference from all parents so the additional
mem_cgrroup_put(parent) potentially causes use-after-free.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: <stable@vger.kernel.org>	[3.9+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 64f7265..6b73d86 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6352,8 +6352,6 @@ mem_cgroup_css_online(struct cgroup *cont)
 		 * call __mem_cgroup_free, so return directly
 		 */
 		mem_cgroup_put(memcg);
-		if (parent->use_hierarchy)
-			mem_cgroup_put(parent);
 	}
 	return error;
 }
-- 
cgit v0.10.2


From f37a96914d1aea10fed8d9af10251f0b9caea31b Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Mon, 8 Jul 2013 16:00:29 -0700
Subject: memcg, kmem: fix reference count handling on the error path

mem_cgroup_css_online calls mem_cgroup_put if memcg_init_kmem fails.
This is not correct because only memcg_propagate_kmem takes an
additional reference while mem_cgroup_sockets_init is allowed to fail as
well (although no current implementation fails) but it doesn't take any
reference.  This all suggests that it should be memcg_propagate_kmem
that should clean up after itself so this patch moves mem_cgroup_put
over there.

Unfortunately this is not that easy (as pointed out by Li Zefan) because
memcg_kmem_mark_dead marks the group dead (KMEM_ACCOUNTED_DEAD) if it is
marked active (KMEM_ACCOUNTED_ACTIVE) which is the case even if
memcg_propagate_kmem fails so the additional reference is dropped in
that case in kmem_cgroup_destroy which means that the reference would be
dropped two times.

The easiest way then would be to simply remove mem_cgrroup_put from
mem_cgroup_css_online and rely on kmem_cgroup_destroy doing the right
thing.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: <stable@vger.kernel.org>	[3.8]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6b73d86..bdeb82c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6345,14 +6345,6 @@ mem_cgroup_css_online(struct cgroup *cont)
 
 	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
 	mutex_unlock(&memcg_create_mutex);
-	if (error) {
-		/*
-		 * We call put now because our (and parent's) refcnts
-		 * are already in place. mem_cgroup_put() will internally
-		 * call __mem_cgroup_free, so return directly
-		 */
-		mem_cgroup_put(memcg);
-	}
 	return error;
 }
 
-- 
cgit v0.10.2


From 5347e5ae13710420eebbbd0b22c045685704da80 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Mon, 8 Jul 2013 16:00:30 -0700
Subject: memcg: use css_get() in sock_update_memcg()

Use css_get/css_put instead of mem_cgroup_get/put.

Note, if at the same time someone is moving @current to a different
cgroup and removing the old cgroup, css_tryget() may return false, and
sock->sk_cgrp won't be initialized, which is fine.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bdeb82c..4c31a21a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -551,15 +551,15 @@ void sock_update_memcg(struct sock *sk)
 		 */
 		if (sk->sk_cgrp) {
 			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
-			mem_cgroup_get(sk->sk_cgrp->memcg);
+			css_get(&sk->sk_cgrp->memcg->css);
 			return;
 		}
 
 		rcu_read_lock();
 		memcg = mem_cgroup_from_task(current);
 		cg_proto = sk->sk_prot->proto_cgroup(memcg);
-		if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
-			mem_cgroup_get(memcg);
+		if (!mem_cgroup_is_root(memcg) &&
+		    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
 			sk->sk_cgrp = cg_proto;
 		}
 		rcu_read_unlock();
@@ -573,7 +573,7 @@ void sock_release_memcg(struct sock *sk)
 		struct mem_cgroup *memcg;
 		WARN_ON(!sk->sk_cgrp->memcg);
 		memcg = sk->sk_cgrp->memcg;
-		mem_cgroup_put(memcg);
+		css_put(&sk->sk_cgrp->memcg->css);
 	}
 }
 
-- 
cgit v0.10.2


From 20f05310ba62d5816fb339d08effe78683137197 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Mon, 8 Jul 2013 16:00:31 -0700
Subject: memcg: don't use mem_cgroup_get() when creating a kmemcg cache

Use css_get()/css_put() instead of mem_cgroup_get()/mem_cgroup_put().

There are two things being done in the current code:

First, we acquired a css_ref to make sure that the underlying cgroup
would not go away.  That is a short lived reference, and it is put as
soon as the cache is created.

At this point, we acquire a long-lived per-cache memcg reference count
to guarantee that the memcg will still be alive.

so it is:

  enqueue: css_get
  create : memcg_get, css_put
  destroy: memcg_put

So we only need to get rid of the memcg_get, change the memcg_put to
css_put, and get rid of the now extra css_put.

(This changelog is mostly written by Glauber)

Signed-off-by: Li Zefan <lizefan@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4c31a21a..80175de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3242,7 +3242,7 @@ void memcg_release_cache(struct kmem_cache *s)
 	list_del(&s->memcg_params->list);
 	mutex_unlock(&memcg->slab_caches_mutex);
 
-	mem_cgroup_put(memcg);
+	css_put(&memcg->css);
 out:
 	kfree(s->memcg_params);
 }
@@ -3402,16 +3402,18 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 
 	mutex_lock(&memcg_cache_mutex);
 	new_cachep = cachep->memcg_params->memcg_caches[idx];
-	if (new_cachep)
+	if (new_cachep) {
+		css_put(&memcg->css);
 		goto out;
+	}
 
 	new_cachep = kmem_cache_dup(memcg, cachep);
 	if (new_cachep == NULL) {
 		new_cachep = cachep;
+		css_put(&memcg->css);
 		goto out;
 	}
 
-	mem_cgroup_get(memcg);
 	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
 
 	cachep->memcg_params->memcg_caches[idx] = new_cachep;
@@ -3499,8 +3501,6 @@ static void memcg_create_cache_work_func(struct work_struct *w)
 
 	cw = container_of(w, struct create_work, work);
 	memcg_create_kmem_cache(cw->memcg, cw->cachep);
-	/* Drop the reference gotten when we enqueued. */
-	css_put(&cw->memcg->css);
 	kfree(cw);
 }
 
-- 
cgit v0.10.2


From 10d5ebf40ff09db03b97cb177f24b9c7c8b4bb52 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Mon, 8 Jul 2013 16:00:33 -0700
Subject: memcg: use css_get/put when charging/uncharging kmem

Use css_get/put instead of mem_cgroup_get/put.

We can't do a simple replacement, because here mem_cgroup_put() is
called during mem_cgroup_css_free(), while mem_cgroup_css_free() won't
be called until css refcnt goes down to 0.

Instead we increment css refcnt in mem_cgroup_css_offline(), and then
check if there's still kmem charges.  If not, css refcnt will be
decremented immediately, otherwise the refcnt will be released after the
last kmem allocation is uncahred.

[akpm@linux-foundation.org: tweak comment]
Signed-off-by: Li Zefan <lizefan@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 80175de..bdc9582 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -406,6 +406,11 @@ static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
 
 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 {
+	/*
+	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
+	 * will call css_put() if it sees the memcg is dead.
+	 */
+	smp_wmb();
 	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
 		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
 }
@@ -3050,8 +3055,16 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
 	if (res_counter_uncharge(&memcg->kmem, size))
 		return;
 
+	/*
+	 * Releases a reference taken in kmem_cgroup_css_offline in case
+	 * this last uncharge is racing with the offlining code or it is
+	 * outliving the memcg existence.
+	 *
+	 * The memory barrier imposed by test&clear is paired with the
+	 * explicit one in memcg_kmem_mark_dead().
+	 */
 	if (memcg_kmem_test_and_clear_dead(memcg))
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 }
 
 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
@@ -5183,14 +5196,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 		 * starts accounting before all call sites are patched
 		 */
 		memcg_kmem_set_active(memcg);
-
-		/*
-		 * kmem charges can outlive the cgroup. In the case of slab
-		 * pages, for instance, a page contain objects from various
-		 * processes, so it is unfeasible to migrate them away. We
-		 * need to reference count the memcg because of that.
-		 */
-		mem_cgroup_get(memcg);
 	} else
 		ret = res_counter_set_limit(&memcg->kmem, val);
 out:
@@ -5223,12 +5228,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 		goto out;
 
 	/*
-	 * destroy(), called if we fail, will issue static_key_slow_inc() and
-	 * mem_cgroup_put() if kmem is enabled. We have to either call them
-	 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
-	 * this more consistent, since it always leads to the same destroy path
+	 * __mem_cgroup_free() will issue static_key_slow_dec() because this
+	 * memcg is active already. If the later initialization fails then the
+	 * cgroup core triggers the cleanup so we do not have to do it here.
 	 */
-	mem_cgroup_get(memcg);
 	static_key_slow_inc(&memcg_kmem_enabled_key);
 
 	mutex_lock(&set_limit_mutex);
@@ -5913,23 +5916,43 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	return mem_cgroup_sockets_init(memcg, ss);
 }
 
-static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
+static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
 	mem_cgroup_sockets_destroy(memcg);
+}
+
+static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
+{
+	if (!memcg_kmem_is_active(memcg))
+		return;
+
+	/*
+	 * kmem charges can outlive the cgroup. In the case of slab
+	 * pages, for instance, a page contain objects from various
+	 * processes. As we prevent from taking a reference for every
+	 * such allocation we have to be careful when doing uncharge
+	 * (see memcg_uncharge_kmem) and here during offlining.
+	 *
+	 * The idea is that that only the _last_ uncharge which sees
+	 * the dead memcg will drop the last reference. An additional
+	 * reference is taken here before the group is marked dead
+	 * which is then paired with css_put during uncharge resp. here.
+	 *
+	 * Although this might sound strange as this path is called from
+	 * css_offline() when the referencemight have dropped down to 0
+	 * and shouldn't be incremented anymore (css_tryget would fail)
+	 * we do not have other options because of the kmem allocations
+	 * lifetime.
+	 */
+	css_get(&memcg->css);
 
 	memcg_kmem_mark_dead(memcg);
 
 	if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
 		return;
 
-	/*
-	 * Charges already down to 0, undo mem_cgroup_get() done in the charge
-	 * path here, being careful not to race with memcg_uncharge_kmem: it is
-	 * possible that the charges went down to 0 between mark_dead and the
-	 * res_counter read, so in that case, we don't need the put
-	 */
 	if (memcg_kmem_test_and_clear_dead(memcg))
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 }
 #else
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -5937,7 +5960,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	return 0;
 }
 
-static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
+static void memcg_destroy_kmem(struct mem_cgroup *memcg)
+{
+}
+
+static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
 {
 }
 #endif
@@ -6370,6 +6397,8 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
+	kmem_cgroup_css_offline(memcg);
+
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
 	mem_cgroup_destroy_all_caches(memcg);
@@ -6379,9 +6408,8 @@ static void mem_cgroup_css_free(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
-	kmem_cgroup_destroy(memcg);
-
-	mem_cgroup_put(memcg);
+	memcg_destroy_kmem(memcg);
+	__mem_cgroup_free(memcg);
 }
 
 #ifdef CONFIG_MMU
-- 
cgit v0.10.2


From 4050377b509b326c14b275fedb2f69b46f37a7a9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Mon, 8 Jul 2013 16:00:34 -0700
Subject: memcg: use css_get/put for swap memcg

Use css_get/put instead of mem_cgroup_get/put.  A simple replacement
will do.

The historical reason that memcg has its own refcnt instead of always
using css_get/put, is that cgroup couldn't be removed if there're still
css refs, so css refs can't be used as long-lived reference.  The
situation has changed so that rmdir a cgroup will succeed regardless css
refs, but won't be freed until css refs goes down to 0.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bdc9582..76c0c99 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4231,12 +4231,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
 	unlock_page_cgroup(pc);
 	/*
 	 * even after unlock, we have memcg->res.usage here and this memcg
-	 * will never be freed.
+	 * will never be freed, so it's safe to call css_get().
 	 */
 	memcg_check_events(memcg, page);
 	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
 		mem_cgroup_swap_statistics(memcg, true);
-		mem_cgroup_get(memcg);
+		css_get(&memcg->css);
 	}
 	/*
 	 * Migration does not charge the res_counter for the
@@ -4348,7 +4348,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 	/*
 	 * record memcg information,  if swapout && memcg != NULL,
-	 * mem_cgroup_get() was called in uncharge().
+	 * css_get() was called in uncharge().
 	 */
 	if (do_swap_account && swapout && memcg)
 		swap_cgroup_record(ent, css_id(&memcg->css));
@@ -4379,7 +4379,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
 		if (!mem_cgroup_is_root(memcg))
 			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 		mem_cgroup_swap_statistics(memcg, false);
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 	}
 	rcu_read_unlock();
 }
@@ -4413,11 +4413,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 		 * This function is only called from task migration context now.
 		 * It postpones res_counter and refcount handling till the end
 		 * of task migration(mem_cgroup_clear_mc()) for performance
-		 * improvement. But we cannot postpone mem_cgroup_get(to)
-		 * because if the process that has been moved to @to does
-		 * swap-in, the refcount of @to might be decreased to 0.
+		 * improvement. But we cannot postpone css_get(to)  because if
+		 * the process that has been moved to @to does swap-in, the
+		 * refcount of @to might be decreased to 0.
+		 *
+		 * We are in attach() phase, so the cgroup is guaranteed to be
+		 * alive, so we can just call css_get().
 		 */
-		mem_cgroup_get(to);
+		css_get(&to->css);
 		return 0;
 	}
 	return -EINVAL;
@@ -6718,6 +6721,7 @@ static void __mem_cgroup_clear_mc(void)
 {
 	struct mem_cgroup *from = mc.from;
 	struct mem_cgroup *to = mc.to;
+	int i;
 
 	/* we must uncharge all the leftover precharges from mc.to */
 	if (mc.precharge) {
@@ -6738,7 +6742,9 @@ static void __mem_cgroup_clear_mc(void)
 		if (!mem_cgroup_is_root(mc.from))
 			res_counter_uncharge(&mc.from->memsw,
 						PAGE_SIZE * mc.moved_swap);
-		__mem_cgroup_put(mc.from, mc.moved_swap);
+
+		for (i = 0; i < mc.moved_swap; i++)
+			css_put(&mc.from->css);
 
 		if (!mem_cgroup_is_root(mc.to)) {
 			/*
@@ -6748,7 +6754,7 @@ static void __mem_cgroup_clear_mc(void)
 			res_counter_uncharge(&mc.to->res,
 						PAGE_SIZE * mc.moved_swap);
 		}
-		/* we've already done mem_cgroup_get(mc.to) */
+		/* we've already done css_get(mc.to) */
 		mc.moved_swap = 0;
 	}
 	memcg_oom_recover(from);
-- 
cgit v0.10.2


From 8d76a9797882fc517d87e2b5db2a4f04edaeccec Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Mon, 8 Jul 2013 16:00:36 -0700
Subject: memcg: don't need to get a reference to the parent

The cgroup core guarantees it's always safe to access the parent.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 76c0c99..c508258 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -503,7 +503,6 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
 
-static void mem_cgroup_get(struct mem_cgroup *memcg);
 static void mem_cgroup_put(struct mem_cgroup *memcg);
 
 static inline
@@ -6239,19 +6238,10 @@ static void free_rcu(struct rcu_head *rcu_head)
 	schedule_work(&memcg->work_freeing);
 }
 
-static void mem_cgroup_get(struct mem_cgroup *memcg)
-{
-	atomic_inc(&memcg->refcnt);
-}
-
 static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
 {
-	if (atomic_sub_and_test(count, &memcg->refcnt)) {
-		struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+	if (atomic_sub_and_test(count, &memcg->refcnt))
 		call_rcu(&memcg->rcu_freeing, free_rcu);
-		if (parent)
-			mem_cgroup_put(parent);
-	}
 }
 
 static void mem_cgroup_put(struct mem_cgroup *memcg)
@@ -6354,12 +6344,9 @@ mem_cgroup_css_online(struct cgroup *cont)
 		res_counter_init(&memcg->kmem, &parent->kmem);
 
 		/*
-		 * We increment refcnt of the parent to ensure that we can
-		 * safely access it on res_counter_charge/uncharge.
-		 * This refcnt will be decremented when freeing this
-		 * mem_cgroup(see mem_cgroup_put).
+		 * No need to take a reference to the parent because cgroup
+		 * core guarantees its existence.
 		 */
-		mem_cgroup_get(parent);
 	} else {
 		res_counter_init(&memcg->res, NULL);
 		res_counter_init(&memcg->memsw, NULL);
-- 
cgit v0.10.2


From e0743e6bc5b7587dd0bfa902d67d3f81ef3f6618 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Mon, 8 Jul 2013 16:00:37 -0700
Subject: memcg: kill memcg refcnt

Now memcg has the same life cycle as its corresponding cgroup.  Kill the
useless refcnt.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c508258..fa521a2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -299,8 +299,6 @@ struct mem_cgroup {
 	bool		oom_lock;
 	atomic_t	under_oom;
 
-	atomic_t	refcnt;
-
 	int	swappiness;
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
@@ -503,8 +501,6 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
 
-static void mem_cgroup_put(struct mem_cgroup *memcg);
-
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 {
@@ -6238,17 +6234,6 @@ static void free_rcu(struct rcu_head *rcu_head)
 	schedule_work(&memcg->work_freeing);
 }
 
-static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
-{
-	if (atomic_sub_and_test(count, &memcg->refcnt))
-		call_rcu(&memcg->rcu_freeing, free_rcu);
-}
-
-static void mem_cgroup_put(struct mem_cgroup *memcg)
-{
-	__mem_cgroup_put(memcg, 1);
-}
-
 /*
  * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
  */
@@ -6308,7 +6293,6 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 
 	memcg->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&memcg->oom_notify);
-	atomic_set(&memcg->refcnt, 1);
 	memcg->move_charge_at_immigrate = 0;
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
@@ -6399,7 +6383,7 @@ static void mem_cgroup_css_free(struct cgroup *cont)
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
 	memcg_destroy_kmem(memcg);
-	__mem_cgroup_free(memcg);
+	call_rcu(&memcg->rcu_freeing, free_rcu);
 }
 
 #ifdef CONFIG_MMU
-- 
cgit v0.10.2


From 465939a1fa283cf2a5194362c5accf4429c99c42 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Mon, 8 Jul 2013 16:00:38 -0700
Subject: memcg: don't need to free memcg via RCU or workqueue

Now memcg has the same life cycle with its corresponding cgroup, and a
cgroup is freed via RCU and then mem_cgroup_css_free() will be called in
a work function, so we can simply call __mem_cgroup_free() in
mem_cgroup_css_free().

This actually reverts commit 59927fb984d ("memcg: free mem_cgroup by RCU
to fix oops").

Signed-off-by: Li Zefan <lizefan@huawei.com>
Cc: Hugh Dickins <hughd@google.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fa521a2..d12ca6f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -263,28 +263,10 @@ struct mem_cgroup {
 	/* vmpressure notifications */
 	struct vmpressure vmpressure;
 
-	union {
-		/*
-		 * the counter to account for mem+swap usage.
-		 */
-		struct res_counter memsw;
-
-		/*
-		 * rcu_freeing is used only when freeing struct mem_cgroup,
-		 * so put it into a union to avoid wasting more memory.
-		 * It must be disjoint from the css field.  It could be
-		 * in a union with the res field, but res plays a much
-		 * larger part in mem_cgroup life than memsw, and might
-		 * be of interest, even at time of free, when debugging.
-		 * So share rcu_head with the less interesting memsw.
-		 */
-		struct rcu_head rcu_freeing;
-		/*
-		 * We also need some space for a worker in deferred freeing.
-		 * By the time we call it, rcu_freeing is no longer in use.
-		 */
-		struct work_struct work_freeing;
-	};
+	/*
+	 * the counter to account for mem+swap usage.
+	 */
+	struct res_counter memsw;
 
 	/*
 	 * the counter to account for kernel memory usage.
@@ -6211,29 +6193,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 		vfree(memcg);
 }
 
-
-/*
- * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
- * but in process context.  The work_freeing structure is overlaid
- * on the rcu_freeing structure, which itself is overlaid on memsw.
- */
-static void free_work(struct work_struct *work)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = container_of(work, struct mem_cgroup, work_freeing);
-	__mem_cgroup_free(memcg);
-}
-
-static void free_rcu(struct rcu_head *rcu_head)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
-	INIT_WORK(&memcg->work_freeing, free_work);
-	schedule_work(&memcg->work_freeing);
-}
-
 /*
  * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
  */
@@ -6383,7 +6342,7 @@ static void mem_cgroup_css_free(struct cgroup *cont)
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
 	memcg_destroy_kmem(memcg);
-	call_rcu(&memcg->rcu_freeing, free_rcu);
+	__mem_cgroup_free(memcg);
 }
 
 #ifdef CONFIG_MMU
-- 
cgit v0.10.2


From 5f12733e9d976132e6cbbae9d08f71406fdacdfb Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Mon, 8 Jul 2013 16:00:40 -0700
Subject: mm: honor min_free_kbytes set by user

min_free_kbytes is updated during memory hotplug (by
init_per_zone_wmark_min) currently which is right thing to do in most
cases but this could be unexpected if admin increased the value to
prevent from allocation failures and the new min_free_kbytes would be
decreased as a result of memory hotadd.

This patch saves the user defined value and allows updating
min_free_kbytes only if it is higher than the saved one.

A warning is printed when the new value is ignored.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Cc: Mel Gorman <mgorman@suse.de>
Acked-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b5855e5..b100255 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -204,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
 };
 
 int min_free_kbytes = 1024;
+int user_min_free_kbytes;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
@@ -5589,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void)
 int __meminit init_per_zone_wmark_min(void)
 {
 	unsigned long lowmem_kbytes;
+	int new_min_free_kbytes;
 
 	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
-
-	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
-	if (min_free_kbytes < 128)
-		min_free_kbytes = 128;
-	if (min_free_kbytes > 65536)
-		min_free_kbytes = 65536;
+	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
+
+	if (new_min_free_kbytes > user_min_free_kbytes) {
+		min_free_kbytes = new_min_free_kbytes;
+		if (min_free_kbytes < 128)
+			min_free_kbytes = 128;
+		if (min_free_kbytes > 65536)
+			min_free_kbytes = 65536;
+	} else {
+		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
+				new_min_free_kbytes, user_min_free_kbytes);
+	}
 	setup_per_zone_wmarks();
 	refresh_zone_stat_thresholds();
 	setup_per_zone_lowmem_reserve();
@@ -5614,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, buffer, length, ppos);
-	if (write)
+	if (write) {
+		user_min_free_kbytes = min_free_kbytes;
 		setup_per_zone_wmarks();
+	}
 	return 0;
 }
 
-- 
cgit v0.10.2


From 0a1be15097a5f5ee8cbaf7cf0a55146363db0e4d Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hp.com>
Date: Mon, 8 Jul 2013 16:00:41 -0700
Subject: mm/memory_hotplug.c: fix return value of online_pages()

online_pages() is called from memory_block_action() when a user requests
to online a memory block via sysfs.  This function needs to return a
proper error value in case of error.

Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index cd2990f..ca1dd3a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -914,19 +914,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
 	    !can_online_high_movable(zone)) {
 		unlock_memory_hotplug();
-		return -1;
+		return -EINVAL;
 	}
 
 	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
 		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
 			unlock_memory_hotplug();
-			return -1;
+			return -EINVAL;
 		}
 	}
 	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
 		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
 			unlock_memory_hotplug();
-			return -1;
+			return -EINVAL;
 		}
 	}
 
-- 
cgit v0.10.2


From dcb6b45254e2281b6f99ea7f2d51343954aa3ba8 Mon Sep 17 00:00:00 2001
From: Alex Thorlton <athorlton@sgi.com>
Date: Mon, 8 Jul 2013 16:00:42 -0700
Subject: panic: add cpu/pid to warn_slowpath_common in WARNING printk()s

Add the cpu/pid that called WARN() so that the stack traces can be
matched up with the WARNING messages.

[akpm@linux-foundation.org: remove stray quote]
Signed-off-by: Alex Thorlton <athorlton@sgi.com>
Reviewed-by: Robin Holt <holt@sgi.com>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: Vikram Mulukutla <markivx@codeaurora.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/panic.c b/kernel/panic.c
index 167ec09..9771231 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -399,8 +399,9 @@ struct slowpath_args {
 static void warn_slowpath_common(const char *file, int line, void *caller,
 				 unsigned taint, struct slowpath_args *args)
 {
-	printk(KERN_WARNING "------------[ cut here ]------------\n");
-	printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
+	pr_warn("------------[ cut here ]------------\n");
+	pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n",
+		raw_smp_processor_id(), current->pid, file, line, caller);
 
 	if (args)
 		vprintk(args->fmt, args->args);
-- 
cgit v0.10.2


From c707a81de71a27a499fde60fbb963f60602c1a94 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 8 Jul 2013 16:00:43 -0700
Subject: checkpatch: make the CamelCase cache work for non-git trees too

Might as well check include timestamps and cache the include file
CamelCase uses for the non-git case too.

The camelcase cache file is now named:

  for git:      .checkpatch-camelcase.git.<commit_id>
  for non-git:  .checkpatch-camelcase.date.<YYYYMMDDhhmm>

All .checkpatch-camelcase* files are deleted if not current.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 6afcd12..2ee9eb7 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -6,6 +6,7 @@
 # Licensed under the terms of the GNU GPL License version 2
 
 use strict;
+use POSIX;
 
 my $P = $0;
 $P =~ s@.*/@@g;
@@ -399,37 +400,52 @@ sub seed_camelcase_includes {
 	return if ($camelcase_seeded);
 
 	my $files;
-	my $camelcase_git_file = "";
+	my $camelcase_cache = "";
+	my @include_files = ();
+
+	$camelcase_seeded = 1;
 
 	if (-d ".git") {
 		my $git_last_include_commit = `git log --no-merges --pretty=format:"%h%n" -1 -- include`;
 		chomp $git_last_include_commit;
-		$camelcase_git_file = ".checkpatch-camelcase.$git_last_include_commit";
-		if (-f $camelcase_git_file) {
-			open(my $camelcase_file, '<', "$camelcase_git_file")
-			    or warn "$P: Can't read '$camelcase_git_file' $!\n";
-			while (<$camelcase_file>) {
-				chomp;
-				$camelcase{$_} = 1;
-			}
-			close($camelcase_file);
-
-			return;
-		}
-		$files = `git ls-files include`;
+		$camelcase_cache = ".checkpatch-camelcase.git.$git_last_include_commit";
 	} else {
+		my $last_mod_date = 0;
 		$files = `find $root/include -name "*.h"`;
+		@include_files = split('\n', $files);
+		foreach my $file (@include_files) {
+			my $date = POSIX::strftime("%Y%m%d%H%M",
+						   localtime((stat $file)[9]));
+			$last_mod_date = $date if ($last_mod_date < $date);
+		}
+		$camelcase_cache = ".checkpatch-camelcase.date.$last_mod_date";
+	}
+
+	if ($camelcase_cache ne "" && -f $camelcase_cache) {
+		open(my $camelcase_file, '<', "$camelcase_cache")
+		    or warn "$P: Can't read '$camelcase_cache' $!\n";
+		while (<$camelcase_file>) {
+			chomp;
+			$camelcase{$_} = 1;
+		}
+		close($camelcase_file);
+
+		return;
+	}
+
+	if (-d ".git") {
+		$files = `git ls-files "include/*.h"`;
+		@include_files = split('\n', $files);
 	}
-	my @include_files = split('\n', $files);
+
 	foreach my $file (@include_files) {
 		seed_camelcase_file($file);
 	}
-	$camelcase_seeded = 1;
 
-	if ($camelcase_git_file ne "") {
+	if ($camelcase_cache ne "") {
 		unlink glob ".checkpatch-camelcase.*";
-		open(my $camelcase_file, '>', "$camelcase_git_file")
-		    or warn "$P: Can't write '$camelcase_git_file' $!\n";
+		open(my $camelcase_file, '>', "$camelcase_cache")
+		    or warn "$P: Can't write '$camelcase_cache' $!\n";
 		foreach (sort { lc($a) cmp lc($b) } keys(%camelcase)) {
 			print $camelcase_file ("$_\n");
 		}
-- 
cgit v0.10.2


From 2417898b34ad3fbf2f31771c97ba87792bf97f0c Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Mon, 8 Jul 2013 16:00:44 -0700
Subject: ncpfs: fix error return code in ncp_parse_options()

Fix to return -EINVAL from the option parse error handling case instead
of 0, as done elsewhere in this function.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Cc: Petr Vandrovec <petr@vandrovec.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 0765ad1..4659da6 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -403,18 +403,24 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)
 		switch (optval) {
 			case 'u':
 				data->uid = make_kuid(current_user_ns(), optint);
-				if (!uid_valid(data->uid))
+				if (!uid_valid(data->uid)) {
+					ret = -EINVAL;
 					goto err;
+				}
 				break;
 			case 'g':
 				data->gid = make_kgid(current_user_ns(), optint);
-				if (!gid_valid(data->gid))
+				if (!gid_valid(data->gid)) {
+					ret = -EINVAL;
 					goto err;
+				}
 				break;
 			case 'o':
 				data->mounted_uid = make_kuid(current_user_ns(), optint);
-				if (!uid_valid(data->mounted_uid))
+				if (!uid_valid(data->mounted_uid)) {
+					ret = -EINVAL;
 					goto err;
+				}
 				break;
 			case 'm':
 				data->file_mode = optint;
-- 
cgit v0.10.2


From 4e80b1880c5a31d051d1e4a7377dec0a20701c23 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <fabio.estevam@freescale.com>
Date: Mon, 8 Jul 2013 16:00:45 -0700
Subject: drivers/rtc/rtc-stmp3xxx.c: check the return value from
 stmp_reset_block()

stmp_reset_block() may fail, so let's check its return value and
propagate it in the case of error.

Signed-off-by: Fabio Estevam <fabio.estevam@freescale.com>
Acked-by: Shawn Guo <shawn.guo@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c
index 90a3e86..767fee2 100644
--- a/drivers/rtc/rtc-stmp3xxx.c
+++ b/drivers/rtc/rtc-stmp3xxx.c
@@ -261,7 +261,12 @@ static int stmp3xxx_rtc_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, rtc_data);
 
-	stmp_reset_block(rtc_data->io);
+	err = stmp_reset_block(rtc_data->io);
+	if (err) {
+		dev_err(&pdev->dev, "stmp_reset_block failed: %d\n", err);
+		return err;
+	}
+
 	writel(STMP3XXX_RTC_PERSISTENT0_ALARM_EN |
 			STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE_EN |
 			STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE,
-- 
cgit v0.10.2


From 6e5b93ee55d401f1619092fb675b57c28c9ed7ec Mon Sep 17 00:00:00 2001
From: Mike Lockwood <lockwood@android.com>
Date: Mon, 8 Jul 2013 16:00:46 -0700
Subject: fatfs: add FAT_IOCTL_GET_VOLUME_ID

This patch, originally from Android kernel, adds vfat ioctl command
FAT_IOCTL_GET_VOLUME_ID, with this command we can get the vfat volume ID
using following code:

	ioctl(fd, FAT_IOCTL_GET_VOLUME_ID, &volume_ID)

This patch is a modified version of the patch by Mike Lockwood, with
changes from Dmitry Pervushin, who noticed the original patch makes some
volume IDs abiguous with error returns: for example, if volume id is
0xFFFFFDAD, that matches -ENOIOCTLCMD, we get "FFFFFFFF" from the user
space.

So add a parameter to ioctl to get the correct volume ID.

Android uses vfat volume ID to identify different sd card, when a new sd
card is inserted to device, android can scan the media on it and pop up
new contents.

Signed-off-by: Bintian Wang <bintian.wang@linaro.org>
Cc: dmitry pervushin <dpervushin@gmail.com>
Cc: Mike Lockwood <lockwood@android.com>
Cc: Colin Cross <ccross@android.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Sean McNeil <sean@mcneil.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 21664fc..4241e6f 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -86,6 +86,7 @@ struct msdos_sb_info {
 	const void *dir_ops;	      /* Opaque; default directory operations */
 	int dir_per_block;	      /* dir entries per block */
 	int dir_per_block_bits;	      /* log2(dir_per_block) */
+	unsigned int vol_id;		/*volume ID*/
 
 	int fatent_shift;
 	struct fatent_operations *fatent_ops;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b0b632e..9b104f5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -114,6 +114,12 @@ out:
 	return err;
 }
 
+static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	return put_user(sbi->vol_id, user_attr);
+}
+
 long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return fat_ioctl_get_attributes(inode, user_attr);
 	case FAT_IOCTL_SET_ATTRIBUTES:
 		return fat_ioctl_set_attributes(filp, user_attr);
+	case FAT_IOCTL_GET_VOLUME_ID:
+		return fat_ioctl_get_volume_id(inode, user_attr);
 	default:
 		return -ENOTTY;	/* Inappropriate ioctl for device */
 	}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5d4513c..11b51bb 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1415,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 		brelse(fsinfo_bh);
 	}
 
+	/* interpret volume ID as a little endian 32 bit integer */
+	if (sbi->fat_bits == 32)
+		sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
+					((u32)b->fat32.vol_id[1] << 8) |
+					((u32)b->fat32.vol_id[2] << 16) |
+					((u32)b->fat32.vol_id[3] << 24));
+	else /* fat 16 or 12 */
+		sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
+					((u32)b->fat16.vol_id[1] << 8) |
+					((u32)b->fat16.vol_id[2] << 16) |
+					((u32)b->fat16.vol_id[3] << 24));
+
 	sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
 	sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
 
diff --git a/include/uapi/linux/msdos_fs.h b/include/uapi/linux/msdos_fs.h
index f055e58..e284ff9 100644
--- a/include/uapi/linux/msdos_fs.h
+++ b/include/uapi/linux/msdos_fs.h
@@ -104,6 +104,8 @@ struct __fat_dirent {
 /* <linux/videotext.h> has used 0x72 ('r') in collision, so skip a few */
 #define FAT_IOCTL_GET_ATTRIBUTES	_IOR('r', 0x10, __u32)
 #define FAT_IOCTL_SET_ATTRIBUTES	_IOW('r', 0x11, __u32)
+/*Android kernel has used 0x12, so we use 0x13*/
+#define FAT_IOCTL_GET_VOLUME_ID		_IOR('r', 0x13, __u32)
 
 struct fat_boot_sector {
 	__u8	ignored[3];	/* Boot strap short or near jump */
@@ -128,6 +130,10 @@ struct fat_boot_sector {
 			__u8	drive_number;	/* Physical drive number */
 			__u8	state;		/* undocumented, but used
 						   for mount state. */
+			__u8	signature;  /* extended boot signature */
+			__u8	vol_id[4];	/* volume ID */
+			__u8	vol_label[11];	/* volume label */
+			__u8	fs_type[8];		/* file system type */
 			/* other fiealds are not added here */
 		} fat16;
 
@@ -147,6 +153,10 @@ struct fat_boot_sector {
 			__u8	drive_number;   /* Physical drive number */
 			__u8    state;       	/* undocumented, but used
 						   for mount state. */
+			__u8	signature;  /* extended boot signature */
+			__u8	vol_id[4];	/* volume ID */
+			__u8	vol_label[11];	/* volume label */
+			__u8	fs_type[8];		/* file system type */
 			/* other fiealds are not added here */
 		} fat32;
 	};
-- 
cgit v0.10.2


From 02be46fba4b154b4a201a729b2d2b4ff6affd031 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:00:47 -0700
Subject: ptrace/x86: revert "hw_breakpoints: Fix racy access to ptrace
 breakpoints"

This reverts commit 87dc669ba257 ("hw_breakpoints: Fix racy access to
ptrace breakpoints").

The patch was fine but we can no longer race with SIGKILL after commit
9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race
with SIGKILL"), the __TASK_TRACED tracee can't be woken up and
->ptrace_bps[] can't go away.

The patch only removes ptrace_get_breakpoints/ptrace_put_breakpoints and
does a couple of "while at it" cleanups, it doesn't remove other changes
from the reverted commit.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 29a8120..7a98b21 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -641,9 +641,6 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
 	unsigned len, type;
 	struct perf_event *bp;
 
-	if (ptrace_get_breakpoints(tsk) < 0)
-		return -ESRCH;
-
 	data &= ~DR_CONTROL_RESERVED;
 	old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
 restore:
@@ -692,9 +689,7 @@ restore:
 		goto restore;
 	}
 
-	ptrace_put_breakpoints(tsk);
-
-	return ((orig_ret < 0) ? orig_ret : rc);
+	return orig_ret < 0 ? orig_ret : rc;
 }
 
 /*
@@ -706,18 +701,10 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
 	unsigned long val = 0;
 
 	if (n < HBP_NUM) {
-		struct perf_event *bp;
+		struct perf_event *bp = thread->ptrace_bps[n];
 
-		if (ptrace_get_breakpoints(tsk) < 0)
-			return -ESRCH;
-
-		bp = thread->ptrace_bps[n];
-		if (!bp)
-			val = 0;
-		else
+		if (bp)
 			val = bp->hw.info.address;
-
-		ptrace_put_breakpoints(tsk);
 	} else if (n == 6) {
 		val = thread->debugreg6;
 	 } else if (n == 7) {
@@ -734,9 +721,6 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 	struct perf_event_attr attr;
 	int err = 0;
 
-	if (ptrace_get_breakpoints(tsk) < 0)
-		return -ESRCH;
-
 	if (!t->ptrace_bps[nr]) {
 		ptrace_breakpoint_init(&attr);
 		/*
@@ -762,7 +746,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 		 */
 		if (IS_ERR(bp)) {
 			err = PTR_ERR(bp);
-			goto put;
+			goto out;
 		}
 
 		t->ptrace_bps[nr] = bp;
@@ -773,9 +757,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 		attr.bp_addr = addr;
 		err = modify_user_hw_breakpoint(bp, &attr);
 	}
-
-put:
-	ptrace_put_breakpoints(tsk);
+out:
 	return err;
 }
 
-- 
cgit v0.10.2


From 6961ed96f14463d7c6e38d8c2093f5d53bd70574 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:00:49 -0700
Subject: ptrace/powerpc: revert "hw_breakpoints: Fix racy access to ptrace
 breakpoints"

This reverts commit 07fa7a0a8a58 ("hw_breakpoints: Fix racy access to
ptrace breakpoints") and removes ptrace_get/put_breakpoints() added by
other commits.

The patch was fine but we can no longer race with SIGKILL after commit
9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race
with SIGKILL"), the __TASK_TRACED tracee can't be woken up and
->ptrace_bps[] can't go away.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Michael Neuling <mikey@neuling.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 64f7bd5..9a0d24c 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -975,16 +975,12 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
 	hw_brk.type = (data & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL;
 	hw_brk.len = 8;
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-	if (ptrace_get_breakpoints(task) < 0)
-		return -ESRCH;
-
 	bp = thread->ptrace_bps[0];
 	if ((!data) || !(hw_brk.type & HW_BRK_TYPE_RDWR)) {
 		if (bp) {
 			unregister_hw_breakpoint(bp);
 			thread->ptrace_bps[0] = NULL;
 		}
-		ptrace_put_breakpoints(task);
 		return 0;
 	}
 	if (bp) {
@@ -997,11 +993,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
 
 		ret =  modify_user_hw_breakpoint(bp, &attr);
 		if (ret) {
-			ptrace_put_breakpoints(task);
 			return ret;
 		}
 		thread->ptrace_bps[0] = bp;
-		ptrace_put_breakpoints(task);
 		thread->hw_brk = hw_brk;
 		return 0;
 	}
@@ -1016,12 +1010,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
 					       ptrace_triggered, NULL, task);
 	if (IS_ERR(bp)) {
 		thread->ptrace_bps[0] = NULL;
-		ptrace_put_breakpoints(task);
 		return PTR_ERR(bp);
 	}
 
-	ptrace_put_breakpoints(task);
-
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 	task->thread.hw_brk = hw_brk;
 #else /* CONFIG_PPC_ADV_DEBUG_REGS */
@@ -1440,26 +1431,19 @@ static long ppc_set_hwdebug(struct task_struct *child,
 	if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE)
 		brk.type |= HW_BRK_TYPE_WRITE;
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-	if (ptrace_get_breakpoints(child) < 0)
-		return -ESRCH;
-
 	/*
 	 * Check if the request is for 'range' breakpoints. We can
 	 * support it if range < 8 bytes.
 	 */
-	if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) {
+	if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE)
 		len = bp_info->addr2 - bp_info->addr;
-	} else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT)
+	else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT)
 		len = 1;
-	else {
-		ptrace_put_breakpoints(child);
+	else
 		return -EINVAL;
-	}
 	bp = thread->ptrace_bps[0];
-	if (bp) {
-		ptrace_put_breakpoints(child);
+	if (bp)
 		return -ENOSPC;
-	}
 
 	/* Create a new breakpoint request if one doesn't exist already */
 	hw_breakpoint_init(&attr);
@@ -1471,11 +1455,9 @@ static long ppc_set_hwdebug(struct task_struct *child,
 					       ptrace_triggered, NULL, child);
 	if (IS_ERR(bp)) {
 		thread->ptrace_bps[0] = NULL;
-		ptrace_put_breakpoints(child);
 		return PTR_ERR(bp);
 	}
 
-	ptrace_put_breakpoints(child);
 	return 1;
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 
@@ -1519,16 +1501,12 @@ static long ppc_del_hwdebug(struct task_struct *child, long data)
 		return -EINVAL;
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-	if (ptrace_get_breakpoints(child) < 0)
-		return -ESRCH;
-
 	bp = thread->ptrace_bps[0];
 	if (bp) {
 		unregister_hw_breakpoint(bp);
 		thread->ptrace_bps[0] = NULL;
 	} else
 		ret = -ENOENT;
-	ptrace_put_breakpoints(child);
 	return ret;
 #else /* CONFIG_HAVE_HW_BREAKPOINT */
 	if (child->thread.hw_brk.address == 0)
-- 
cgit v0.10.2


From 6af9df7f5ba35806a5919d3a36d95fd40e210b89 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:00:51 -0700
Subject: ptrace/arm: revert "hw_breakpoints: Fix racy access to ptrace
 breakpoints"

This reverts commit bf0b8f4b55e5 ("hw_breakpoints: Fix racy access to
ptrace breakpoints").

The patch was fine but we can no longer race with SIGKILL after commit
9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race
with SIGKILL"), the __TASK_TRACED tracee can't be woken up and
->ptrace_bps[] can't go away.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 2bc1514..0dd3b79 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -886,20 +886,12 @@ long arch_ptrace(struct task_struct *child, long request,
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		case PTRACE_GETHBPREGS:
-			if (ptrace_get_breakpoints(child) < 0)
-				return -ESRCH;
-
 			ret = ptrace_gethbpregs(child, addr,
 						(unsigned long __user *)data);
-			ptrace_put_breakpoints(child);
 			break;
 		case PTRACE_SETHBPREGS:
-			if (ptrace_get_breakpoints(child) < 0)
-				return -ESRCH;
-
 			ret = ptrace_sethbpregs(child, addr,
 						(unsigned long __user *)data);
-			ptrace_put_breakpoints(child);
 			break;
 #endif
 
-- 
cgit v0.10.2


From e8c073c4ff51207f5c1c37fb054360bbc0f38251 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:00:52 -0700
Subject: ptrace/sh: revert "hw_breakpoints: Fix racy access to ptrace
 breakpoints"

This reverts commit e0ac8457d020 ("hw_breakpoints: Fix racy access to
ptrace breakpoints").

The patch was fine but we can no longer race with SIGKILL after commit
9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race
with SIGKILL"), the __TASK_TRACED tracee can't be woken up and
->ptrace_bps[] can't go away.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 81f999a..668c816 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -117,11 +117,7 @@ void user_enable_single_step(struct task_struct *child)
 
 	set_tsk_thread_flag(child, TIF_SINGLESTEP);
 
-	if (ptrace_get_breakpoints(child) < 0)
-		return;
-
 	set_single_step(child, pc);
-	ptrace_put_breakpoints(child);
 }
 
 void user_disable_single_step(struct task_struct *child)
-- 
cgit v0.10.2


From 7c8df28633bf0b7eb253f866029be0ac59ddb062 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:00:54 -0700
Subject: ptrace: revert "Prepare to fix racy accesses on task breakpoints"

This reverts commit bf26c018490c ("Prepare to fix racy accesses on task
breakpoints").

The patch was fine but we can no longer race with SIGKILL after commit
9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race
with SIGKILL"), the __TASK_TRACED tracee can't be woken up and
->ptrace_bps[] can't go away.

Now that ptrace_get_breakpoints/ptrace_put_breakpoints have no callers,
we can kill them and remove task->ptrace_bp_refcnt.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Michael Neuling <mikey@neuling.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 89573a3..07d0df6 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -142,9 +142,6 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace)
 {
 	INIT_LIST_HEAD(&child->ptrace_entry);
 	INIT_LIST_HEAD(&child->ptraced);
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
-	atomic_set(&child->ptrace_bp_refcnt, 1);
-#endif
 	child->jobctl = 0;
 	child->ptrace = 0;
 	child->parent = child->real_parent;
@@ -351,11 +348,4 @@ extern int task_current_syscall(struct task_struct *target, long *callno,
 				unsigned long args[6], unsigned int maxargs,
 				unsigned long *sp, unsigned long *pc);
 
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
-extern int ptrace_get_breakpoints(struct task_struct *tsk);
-extern void ptrace_put_breakpoints(struct task_struct *tsk);
-#else
-static inline void ptrace_put_breakpoints(struct task_struct *tsk) { }
-#endif /* CONFIG_HAVE_HW_BREAKPOINT */
-
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cdd5407..75324d8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1401,9 +1401,6 @@ struct task_struct {
 	} memcg_batch;
 	unsigned int memcg_kmem_skip_account;
 #endif
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
-	atomic_t ptrace_bp_refcnt;
-#endif
 #ifdef CONFIG_UPROBES
 	struct uprobe_task *utask;
 #endif
diff --git a/kernel/exit.c b/kernel/exit.c
index fafe75d..a949819 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -808,7 +808,7 @@ void do_exit(long code)
 	/*
 	 * FIXME: do that only when needed, using sched_exit tracepoint
 	 */
-	ptrace_put_breakpoints(tsk);
+	flush_ptrace_hw_breakpoint(tsk);
 
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ba5e6ce..a146ee3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1221,19 +1221,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 	return ret;
 }
 #endif	/* CONFIG_COMPAT */
-
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
-int ptrace_get_breakpoints(struct task_struct *tsk)
-{
-	if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
-		return 0;
-
-	return -1;
-}
-
-void ptrace_put_breakpoints(struct task_struct *tsk)
-{
-	if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
-		flush_ptrace_hw_breakpoint(tsk);
-}
-#endif /* CONFIG_HAVE_HW_BREAKPOINT */
-- 
cgit v0.10.2


From e6a7d6077106e5c72f0519ec113d986df67ee001 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:00:56 -0700
Subject: ptrace/x86: simplify the "disable" logic in ptrace_write_dr7()

ptrace_write_dr7() looks unnecessarily overcomplicated.  We can factor
out ptrace_modify_breakpoint() and do not do "continue" twice, just we
need to pass the proper "disabled" argument to
ptrace_modify_breakpoint().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7a98b21..0649f16 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -637,9 +637,7 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
 	struct thread_struct *thread = &(tsk->thread);
 	unsigned long old_dr7;
 	int i, orig_ret = 0, rc = 0;
-	int enabled, second_pass = 0;
-	unsigned len, type;
-	struct perf_event *bp;
+	int second_pass = 0;
 
 	data &= ~DR_CONTROL_RESERVED;
 	old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
@@ -649,30 +647,22 @@ restore:
 	 * appropriate changes to each.
 	 */
 	for (i = 0; i < HBP_NUM; i++) {
-		enabled = decode_dr7(data, i, &len, &type);
-		bp = thread->ptrace_bps[i];
-
-		if (!enabled) {
-			if (bp) {
-				/*
-				 * Don't unregister the breakpoints right-away,
-				 * unless all register_user_hw_breakpoint()
-				 * requests have succeeded. This prevents
-				 * any window of opportunity for debug
-				 * register grabbing by other users.
-				 */
-				if (!second_pass)
-					continue;
-
-				rc = ptrace_modify_breakpoint(bp, len, type,
-							      tsk, 1);
-				if (rc)
-					break;
-			}
-			continue;
+		unsigned len, type;
+		bool disabled = !decode_dr7(data, i, &len, &type);
+		struct perf_event *bp = thread->ptrace_bps[i];
+
+		if (disabled) {
+			/*
+			 * Don't unregister the breakpoints right-away, unless
+			 * all register_user_hw_breakpoint() requests have
+			 * succeeded. This prevents any window of opportunity
+			 * for debug register grabbing by other users.
+			 */
+			if (!bp || !second_pass)
+				continue;
 		}
 
-		rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
+		rc = ptrace_modify_breakpoint(bp, len, type, tsk, disabled);
 		if (rc)
 			break;
 	}
-- 
cgit v0.10.2


From 29a55513414187b50d3cebb99884955a78d97283 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:00:58 -0700
Subject: ptrace/x86: dont delay "disable" till second pass in
 ptrace_write_dr7()

ptrace_write_dr7() skips ptrace_modify_breakpoint(disabled => true)
unless second_pass, this buys nothing but complicates the code and means
that we always do the main loop twice even if "disabled" was never true.

The comment says:

	Don't unregister the breakpoints right-away,
	unless all register_user_hw_breakpoint()
	requests have succeeded.

Firstly, we do not do register_user_hw_breakpoint(), it was removed by
commit 24f1e32c60c4 ("hw-breakpoints: Rewrite the hw-breakpoints layer
on top of perf events").

We are going to restore register_user_hw_breakpoint() (see the next
patch) but this doesn't matter: after commit 44234adcdce3
("hw-breakpoints: Modify breakpoints without unregistering them")
perf_event_disable() can not hurt, hw_breakpoint_del() does not free the
slot.

Remove the "second_pass" check from the main loop and simplify the code.
Since we have to check "bp != NULL" anyway, the patch also removes the
same check in ptrace_modify_breakpoint() and moves the comment into
ptrace_write_dr7().

With this patch the second pass is only needed to restore the saved
old_dr7.  This should never fail, so the patch adds WARN_ON() to catch
the potential problems as Frederic suggested.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0649f16..98b0a2c 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -609,14 +609,6 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
 	int gen_len, gen_type;
 	struct perf_event_attr attr;
 
-	/*
-	 * We should have at least an inactive breakpoint at this
-	 * slot. It means the user is writing dr7 without having
-	 * written the address register first
-	 */
-	if (!bp)
-		return -EINVAL;
-
 	err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
 	if (err)
 		return err;
@@ -634,52 +626,47 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
  */
 static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
 {
-	struct thread_struct *thread = &(tsk->thread);
+	struct thread_struct *thread = &tsk->thread;
 	unsigned long old_dr7;
-	int i, orig_ret = 0, rc = 0;
-	int second_pass = 0;
+	bool second_pass = false;
+	int i, rc, ret = 0;
 
 	data &= ~DR_CONTROL_RESERVED;
 	old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+
 restore:
-	/*
-	 * Loop through all the hardware breakpoints, making the
-	 * appropriate changes to each.
-	 */
+	rc = 0;
 	for (i = 0; i < HBP_NUM; i++) {
 		unsigned len, type;
 		bool disabled = !decode_dr7(data, i, &len, &type);
 		struct perf_event *bp = thread->ptrace_bps[i];
 
-		if (disabled) {
+		if (!bp) {
+			if (disabled)
+				continue;
 			/*
-			 * Don't unregister the breakpoints right-away, unless
-			 * all register_user_hw_breakpoint() requests have
-			 * succeeded. This prevents any window of opportunity
-			 * for debug register grabbing by other users.
+			 * We should have at least an inactive breakpoint at
+			 * this slot. It means the user is writing dr7 without
+			 * having written the address register first.
 			 */
-			if (!bp || !second_pass)
-				continue;
+			rc = -EINVAL;
+			break;
 		}
 
 		rc = ptrace_modify_breakpoint(bp, len, type, tsk, disabled);
 		if (rc)
 			break;
 	}
-	/*
-	 * Make a second pass to free the remaining unused breakpoints
-	 * or to restore the original breakpoints if an error occurred.
-	 */
-	if (!second_pass) {
-		second_pass = 1;
-		if (rc < 0) {
-			orig_ret = rc;
-			data = old_dr7;
-		}
+
+	/* Restore if the first pass failed, second_pass shouldn't fail. */
+	if (rc && !WARN_ON(second_pass)) {
+		ret = rc;
+		data = old_dr7;
+		second_pass = true;
 		goto restore;
 	}
 
-	return orig_ret < 0 ? orig_ret : rc;
+	return ret;
 }
 
 /*
-- 
cgit v0.10.2


From 9afe33ada275f2413dfeae27cc58fbb27474ac72 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:00:59 -0700
Subject: ptrace/x86: introduce ptrace_register_breakpoint()

No functional changes, preparation.

Extract the "register breakpoint" code from ptrace_get_debugreg() into
the new/generic helper, ptrace_register_breakpoint().  It will have more
users.

The patch also adds another simple helper, ptrace_fill_bp_fields(), to
factor out the arch_bp_generic_fields() logic in register/modify.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 98b0a2c..0526368 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -601,22 +601,48 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[])
 	return dr7;
 }
 
-static int
-ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
-			 struct task_struct *tsk, int disabled)
+static int ptrace_fill_bp_fields(struct perf_event_attr *attr,
+					int len, int type, bool disabled)
+{
+	int err, bp_len, bp_type;
+
+	err = arch_bp_generic_fields(len, type, &bp_len, &bp_type);
+	if (!err) {
+		attr->bp_len = bp_len;
+		attr->bp_type = bp_type;
+		attr->disabled = disabled;
+	}
+
+	return err;
+}
+
+static struct perf_event *
+ptrace_register_breakpoint(struct task_struct *tsk, int len, int type,
+				unsigned long addr, bool disabled)
 {
-	int err;
-	int gen_len, gen_type;
 	struct perf_event_attr attr;
+	int err;
+
+	ptrace_breakpoint_init(&attr);
+	attr.bp_addr = addr;
 
-	err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+	err = ptrace_fill_bp_fields(&attr, len, type, disabled);
 	if (err)
-		return err;
+		return ERR_PTR(err);
+
+	return register_user_hw_breakpoint(&attr, ptrace_triggered,
+						 NULL, tsk);
+}
 
-	attr = bp->attr;
-	attr.bp_len = gen_len;
-	attr.bp_type = gen_type;
-	attr.disabled = disabled;
+static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+					int disabled)
+{
+	struct perf_event_attr attr = bp->attr;
+	int err;
+
+	err = ptrace_fill_bp_fields(&attr, len, type, disabled);
+	if (err)
+		return err;
 
 	return modify_user_hw_breakpoint(bp, &attr);
 }
@@ -653,7 +679,7 @@ restore:
 			break;
 		}
 
-		rc = ptrace_modify_breakpoint(bp, len, type, tsk, disabled);
+		rc = ptrace_modify_breakpoint(bp, len, type, disabled);
 		if (rc)
 			break;
 	}
@@ -693,26 +719,14 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
 static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 				      unsigned long addr)
 {
-	struct perf_event *bp;
 	struct thread_struct *t = &tsk->thread;
-	struct perf_event_attr attr;
+	struct perf_event *bp = t->ptrace_bps[nr];
 	int err = 0;
 
-	if (!t->ptrace_bps[nr]) {
-		ptrace_breakpoint_init(&attr);
-		/*
-		 * Put stub len and type to register (reserve) an inactive but
-		 * correct bp
-		 */
-		attr.bp_addr = addr;
-		attr.bp_len = HW_BREAKPOINT_LEN_1;
-		attr.bp_type = HW_BREAKPOINT_W;
-		attr.disabled = 1;
-
-		bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
-						 NULL, tsk);
-
+	if (!bp) {
 		/*
+		 * Put stub len and type to create an inactive but correct bp.
+		 *
 		 * CHECKME: the previous code returned -EIO if the addr wasn't
 		 * a valid task virtual addr. The new one will return -EINVAL in
 		 *  this case.
@@ -721,20 +735,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 		 * writing for the user. And anyway this is the previous
 		 * behaviour.
 		 */
-		if (IS_ERR(bp)) {
+		bp = ptrace_register_breakpoint(tsk,
+				X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE,
+				addr, true);
+		if (IS_ERR(bp))
 			err = PTR_ERR(bp);
-			goto out;
-		}
-
-		t->ptrace_bps[nr] = bp;
+		else
+			t->ptrace_bps[nr] = bp;
 	} else {
-		bp = t->ptrace_bps[nr];
+		struct perf_event_attr attr = bp->attr;
 
-		attr = bp->attr;
 		attr.bp_addr = addr;
 		err = modify_user_hw_breakpoint(bp, &attr);
 	}
-out:
+
 	return err;
 }
 
-- 
cgit v0.10.2


From b87a95ad609619482df0690320d5ace33ace8e7a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:01:01 -0700
Subject: ptrace/x86: ptrace_write_dr7() should create bp if !disabled

Commit 24f1e32c60c4 ("hw-breakpoints: Rewrite the hw-breakpoints layer
on top of perf events") introduced the minor regression.  Before this
commit

	PTRACE_POKEUSER DR7, enableDR0
	PTRACE_POKEUSER DR0, address

was perfectly valid, now PTRACE_POKEUSER(DR7) fails if DR0 was not
previously initialized by PTRACE_POKEUSER(DR0).

Change ptrace_write_dr7() to do ptrace_register_breakpoint(addr => 0) if
!bp && !disabled.

This fixes watchpoint-zeroaddr from ptrace-tests, see

    https://bugzilla.redhat.com/show_bug.cgi?id=660204.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Jan Kratochvil <jan.kratochvil@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0526368..5c387b3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -670,13 +670,16 @@ restore:
 		if (!bp) {
 			if (disabled)
 				continue;
-			/*
-			 * We should have at least an inactive breakpoint at
-			 * this slot. It means the user is writing dr7 without
-			 * having written the address register first.
-			 */
-			rc = -EINVAL;
-			break;
+
+			bp = ptrace_register_breakpoint(tsk,
+					len, type, 0, disabled);
+			if (IS_ERR(bp)) {
+				rc = PTR_ERR(bp);
+				break;
+			}
+
+			thread->ptrace_bps[i] = bp;
+			continue;
 		}
 
 		rc = ptrace_modify_breakpoint(bp, len, type, disabled);
-- 
cgit v0.10.2


From 61e305c716c0737c97bd133313cc90e99a93712e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:01:03 -0700
Subject: ptrace/x86: cleanup ptrace_set_debugreg()

ptrace_set_debugreg() is trivial but looks horrible.  Kill the unnecessary
goto's and return's to cleanup the code.

This matches ptrace_get_debugreg() which also needs the trivial whitespace
cleanups.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 5c387b3..7461f50 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -703,7 +703,7 @@ restore:
  */
 static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
 {
-	struct thread_struct *thread = &(tsk->thread);
+	struct thread_struct *thread = &tsk->thread;
 	unsigned long val = 0;
 
 	if (n < HBP_NUM) {
@@ -713,7 +713,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
 			val = bp->hw.info.address;
 	} else if (n == 6) {
 		val = thread->debugreg6;
-	 } else if (n == 7) {
+	} else if (n == 7) {
 		val = thread->ptrace_dr7;
 	}
 	return val;
@@ -761,30 +761,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 static int ptrace_set_debugreg(struct task_struct *tsk, int n,
 			       unsigned long val)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	int rc = 0;
-
+	struct thread_struct *thread = &tsk->thread;
 	/* There are no DR4 or DR5 registers */
-	if (n == 4 || n == 5)
-		return -EIO;
+	int rc = -EIO;
 
-	if (n == 6) {
-		thread->debugreg6 = val;
-		goto ret_path;
-	}
 	if (n < HBP_NUM) {
 		rc = ptrace_set_breakpoint_addr(tsk, n, val);
-		if (rc)
-			return rc;
-	}
-	/* All that's left is DR7 */
-	if (n == 7) {
+	} else if (n == 6) {
+		thread->debugreg6 = val;
+		rc = 0;
+	} else if (n == 7) {
 		rc = ptrace_write_dr7(tsk, val);
 		if (!rc)
 			thread->ptrace_dr7 = val;
 	}
-
-ret_path:
 	return rc;
 }
 
-- 
cgit v0.10.2


From fab840fc2d542fabcab903db8e03589a6702ba5f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:01:05 -0700
Subject: ptrace: PTRACE_DETACH should do flush_ptrace_hw_breakpoint(child)

Change ptrace_detach() to call flush_ptrace_hw_breakpoint(child).  This
frees the slots for non-ptrace PERF_TYPE_BREAKPOINT users, and this
ensures that the tracee won't be killed by SIGTRAP triggered by the
active breakpoints.

Test-case:

	unsigned long encode_dr7(int drnum, int enable, unsigned int type, unsigned int len)
	{
		unsigned long dr7;

		dr7 = ((len | type) & 0xf)
			<< (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
		if (enable)
			dr7 |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));

		return dr7;
	}

	int write_dr(int pid, int dr, unsigned long val)
	{
		return ptrace(PTRACE_POKEUSER, pid,
				offsetof (struct user, u_debugreg[dr]),
				val);
	}

	void func(void)
	{
	}

	int main(void)
	{
		int pid, stat;
		unsigned long dr7;

		pid = fork();
		if (!pid) {
			assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0);
			kill(getpid(), SIGHUP);

			func();
			return 0x13;
		}

		assert(pid == waitpid(-1, &stat, 0));
		assert(WSTOPSIG(stat) == SIGHUP);

		assert(write_dr(pid, 0, (long)func) == 0);
		dr7 = encode_dr7(0, 1, DR_RW_EXECUTE, DR_LEN_1);
		assert(write_dr(pid, 7, dr7) == 0);

		assert(ptrace(PTRACE_DETACH, pid, 0,0) == 0);
		assert(pid == waitpid(-1, &stat, 0));
		assert(stat == 0x1300);

		return 0;
	}

Before this patch the child is killed after PTRACE_DETACH.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a146ee3..4041f57 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -469,6 +469,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
 	/* Architecture-specific hardware disable .. */
 	ptrace_disable(child);
 	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+	flush_ptrace_hw_breakpoint(child);
 
 	write_lock_irq(&tasklist_lock);
 	/*
-- 
cgit v0.10.2


From f7da04c9e363e479258135ac825734d78aecd2b0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 16:01:06 -0700
Subject: ptrace/x86: flush_ptrace_hw_breakpoint() shoule clear the virtual
 debug registers

flush_ptrace_hw_breakpoint() destroys the counters set by ptrace, but
"leaks" ->debugreg6 and ->ptrace_dr7.

The problem is minor, but still it doesn't look right and flush_thread()
did this until commit 66cb59172959 ("hw-breakpoints: use the new wrapper
routines to access debug registers in process/thread code").  Now that
PTRACE_DETACH does flush_ too this makes even more sense.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 02f0763..f66ff16 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -393,6 +393,9 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
 		unregister_hw_breakpoint(t->ptrace_bps[i]);
 		t->ptrace_bps[i] = NULL;
 	}
+
+	t->debugreg6 = 0;
+	t->ptrace_dr7 = 0;
 }
 
 void hw_breakpoint_restore(void)
-- 
cgit v0.10.2


From c103a4dc4a32f53f095b66cd798d648c652f05b4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 8 Jul 2013 16:01:08 -0700
Subject: ipc/shmc.c: eliminate ugly 80-col tricks

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/shm.c b/ipc/shm.c
index 7e199fa..85dc001 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -491,10 +491,10 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 
 	sprintf (name, "SYSV%08x", key);
 	if (shmflg & SHM_HUGETLB) {
-		struct hstate *hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT)
-						& SHM_HUGE_MASK);
+		struct hstate *hs;
 		size_t hugesize;
 
+		hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
 		if (!hs) {
 			error = -EINVAL;
 			goto no_file;
diff --git a/mm/mmap.c b/mm/mmap.c
index 0718c17..f813111 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1368,9 +1368,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 			goto out_fput;
 	} else if (flags & MAP_HUGETLB) {
 		struct user_struct *user = NULL;
-		struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) &
-						   SHM_HUGE_MASK);
+		struct hstate *hs;
 
+		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
 		if (!hs)
 			return -EINVAL;
 
-- 
cgit v0.10.2


From dbfcd91f06f0e2d5564b2fd184e9c2a43675f9ab Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr.bueso@hp.com>
Date: Mon, 8 Jul 2013 16:01:09 -0700
Subject: ipc: move rcu lock out of ipc_addid

This patchset continues the work that began in the sysv ipc semaphore
scaling series, see

  https://lkml.org/lkml/2013/3/20/546

Just like semaphores used to be, sysv shared memory and msg queues also
abuse the ipc lock, unnecessarily holding it for operations such as
permission and security checks.

This patchset mostly deals with mqueues, and while shared mem can be
done in a very similar way, I want to get these patches out in the open
first.  It also does some pending cleanups, mostly focused on the two
level locking we have in ipc code, taking care of ipc_addid() and
ipcctl_pre_down_nolock() - yes there are still functions that need to be
updated as well.

This patch:

Make all callers explicitly take and release the RCU read lock.

This addresses the two level locking seen in newary(), newseg() and
newqueue().  For the last two, explicitly unlock the ipc object and the
rcu lock, instead of calling the custom shm_unlock and msg_unlock
functions.  The next patch will deal with the open coded locking for
->perm.lock

Signed-off-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/msg.c b/ipc/msg.c
index d0c6d96..996feb8 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -199,9 +199,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
 		return retval;
 	}
 
-	/*
-	 * ipc_addid() locks msq
-	 */
+	/* ipc_addid() locks msq upon success. */
 	id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
 	if (id < 0) {
 		security_msg_queue_free(msq);
@@ -218,7 +216,8 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
 	INIT_LIST_HEAD(&msq->q_receivers);
 	INIT_LIST_HEAD(&msq->q_senders);
 
-	msg_unlock(msq);
+	spin_unlock(&msq->q_perm.lock);
+	rcu_read_unlock();
 
 	return msq->q_perm.id;
 }
diff --git a/ipc/shm.c b/ipc/shm.c
index 85dc001..bd2b14e 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -535,6 +535,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	shp->shm_nattch = 0;
 	shp->shm_file = file;
 	shp->shm_creator = current;
+
 	/*
 	 * shmid gets reported as "inode#" in /proc/pid/maps.
 	 * proc-ps tools use this. Changing this will break them.
@@ -543,7 +544,9 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 
 	ns->shm_tot += numpages;
 	error = shp->shm_perm.id;
-	shm_unlock(shp);
+
+	spin_unlock(&shp->shm_perm.lock);
+	rcu_read_unlock();
 	return error;
 
 no_id:
diff --git a/ipc/util.c b/ipc/util.c
index 809ec5e..399821a 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -246,9 +246,8 @@ int ipc_get_maxid(struct ipc_ids *ids)
  *	is returned. The 'new' entry is returned in a locked state on success.
  *	On failure the entry is not locked and a negative err-code is returned.
  *
- *	Called with ipc_ids.rw_mutex held as a writer.
+ *	Called with writer ipc_ids.rw_mutex held.
  */
- 
 int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
 {
 	kuid_t euid;
-- 
cgit v0.10.2


From 1ca7003ab41152d673d9e359632283d05294f3d6 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr.bueso@hp.com>
Date: Mon, 8 Jul 2013 16:01:10 -0700
Subject: ipc: introduce ipc object locking helpers

Simple helpers around the (kern_ipc_perm *)->lock spinlock.

Signed-off-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/util.h b/ipc/util.h
index 2b0bdd5..da65e8a 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -159,23 +159,33 @@ static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid)
 	return uid / SEQ_MULTIPLIER != ipcp->seq;
 }
 
-static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm)
+static inline void ipc_lock_object(struct kern_ipc_perm *perm)
 {
-	rcu_read_lock();
 	spin_lock(&perm->lock);
 }
 
-static inline void ipc_unlock(struct kern_ipc_perm *perm)
+static inline void ipc_unlock_object(struct kern_ipc_perm *perm)
 {
 	spin_unlock(&perm->lock);
-	rcu_read_unlock();
 }
 
-static inline void ipc_lock_object(struct kern_ipc_perm *perm)
+static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm)
 {
+	assert_spin_locked(&perm->lock);
+}
+
+static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm)
+{
+	rcu_read_lock();
 	spin_lock(&perm->lock);
 }
 
+static inline void ipc_unlock(struct kern_ipc_perm *perm)
+{
+	spin_unlock(&perm->lock);
+	rcu_read_unlock();
+}
+
 struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id);
 struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id);
 int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
-- 
cgit v0.10.2


From cf9d5d78d05bca96df7618dfc3a5ee4414dcae58 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr.bueso@hp.com>
Date: Mon, 8 Jul 2013 16:01:11 -0700
Subject: ipc: close open coded spin lock calls

Signed-off-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/msg.c b/ipc/msg.c
index 996feb8..7a3d6aa 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -216,7 +216,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
 	INIT_LIST_HEAD(&msq->q_receivers);
 	INIT_LIST_HEAD(&msq->q_senders);
 
-	spin_unlock(&msq->q_perm.lock);
+	ipc_unlock_object(&msq->q_perm);
 	rcu_read_unlock();
 
 	return msq->q_perm.id;
diff --git a/ipc/sem.c b/ipc/sem.c
index 70480a3..92ec6c6 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -246,7 +246,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 		 * their critical section while the array lock is held.
 		 */
  lock_array:
-		spin_lock(&sma->sem_perm.lock);
+		ipc_lock_object(&sma->sem_perm);
 		for (i = 0; i < sma->sem_nsems; i++) {
 			struct sem *sem = sma->sem_base + i;
 			spin_unlock_wait(&sem->lock);
@@ -259,7 +259,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 static inline void sem_unlock(struct sem_array *sma, int locknum)
 {
 	if (locknum == -1) {
-		spin_unlock(&sma->sem_perm.lock);
+		ipc_unlock_object(&sma->sem_perm);
 	} else {
 		struct sem *sem = sma->sem_base + locknum;
 		spin_unlock(&sem->lock);
@@ -872,7 +872,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 	int i;
 
 	/* Free the existing undo structures for this semaphore set.  */
-	assert_spin_locked(&sma->sem_perm.lock);
+	ipc_assert_locked_object(&sma->sem_perm);
 	list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
 		list_del(&un->list_id);
 		spin_lock(&un->ulp->lock);
@@ -1070,7 +1070,7 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
 
 	curr = &sma->sem_base[semnum];
 
-	assert_spin_locked(&sma->sem_perm.lock);
+	ipc_assert_locked_object(&sma->sem_perm);
 	list_for_each_entry(un, &sma->list_id, list_id)
 		un->semadj[semnum] = 0;
 
@@ -1199,7 +1199,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 		for (i = 0; i < nsems; i++)
 			sma->sem_base[i].semval = sem_io[i];
 
-		assert_spin_locked(&sma->sem_perm.lock);
+		ipc_assert_locked_object(&sma->sem_perm);
 		list_for_each_entry(un, &sma->list_id, list_id) {
 			for (i = 0; i < nsems; i++)
 				un->semadj[i] = 0;
@@ -1496,7 +1496,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
 	new->semid = semid;
 	assert_spin_locked(&ulp->lock);
 	list_add_rcu(&new->list_proc, &ulp->list_proc);
-	assert_spin_locked(&sma->sem_perm.lock);
+	ipc_assert_locked_object(&sma->sem_perm);
 	list_add(&new->list_id, &sma->list_id);
 	un = new;
 
@@ -1833,7 +1833,7 @@ void exit_sem(struct task_struct *tsk)
 		}
 
 		/* remove un from the linked lists */
-		assert_spin_locked(&sma->sem_perm.lock);
+		ipc_assert_locked_object(&sma->sem_perm);
 		list_del(&un->list_id);
 
 		spin_lock(&ulp->lock);
diff --git a/ipc/shm.c b/ipc/shm.c
index bd2b14e..e7d5107 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -141,7 +141,7 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
 static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
 {
 	rcu_read_lock();
-	spin_lock(&ipcp->shm_perm.lock);
+	ipc_lock_object(&ipcp->shm_perm);
 }
 
 static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns,
@@ -545,7 +545,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	ns->shm_tot += numpages;
 	error = shp->shm_perm.id;
 
-	spin_unlock(&shp->shm_perm.lock);
+	ipc_unlock_object(&shp->shm_perm);
 	rcu_read_unlock();
 	return error;
 
diff --git a/ipc/util.h b/ipc/util.h
index da65e8a..b6a6a88 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -177,12 +177,12 @@ static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm)
 static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm)
 {
 	rcu_read_lock();
-	spin_lock(&perm->lock);
+	ipc_lock_object(perm);
 }
 
 static inline void ipc_unlock(struct kern_ipc_perm *perm)
 {
-	spin_unlock(&perm->lock);
+	ipc_unlock_object(perm);
 	rcu_read_unlock();
 }
 
-- 
cgit v0.10.2


From 7b4cc5d8411bd4e9d61d8714f53859740cf830c2 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr.bueso@hp.com>
Date: Mon, 8 Jul 2013 16:01:12 -0700
Subject: ipc: move locking out of ipcctl_pre_down_nolock

This function currently acquires both the rw_mutex and the rcu lock on
successful lookups, leaving the callers to explicitly unlock them,
creating another two level locking situation.

Make the callers (including those that still use ipcctl_pre_down())
explicitly lock and unlock the rwsem and rcu lock.

Signed-off-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/msg.c b/ipc/msg.c
index 7a3d6aa..f62fa5e 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -407,31 +407,38 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
 			return -EFAULT;
 	}
 
+	down_write(&msg_ids(ns).rw_mutex);
+	rcu_read_lock();
+
 	ipcp = ipcctl_pre_down(ns, &msg_ids(ns), msqid, cmd,
 			       &msqid64.msg_perm, msqid64.msg_qbytes);
-	if (IS_ERR(ipcp))
-		return PTR_ERR(ipcp);
+	if (IS_ERR(ipcp)) {
+		err = PTR_ERR(ipcp);
+		/* the ipc lock is not held upon failure */
+		goto out_unlock1;
+	}
 
 	msq = container_of(ipcp, struct msg_queue, q_perm);
 
 	err = security_msg_queue_msgctl(msq, cmd);
 	if (err)
-		goto out_unlock;
+		goto out_unlock0;
 
 	switch (cmd) {
 	case IPC_RMID:
+		/* freeque unlocks the ipc object and rcu */
 		freeque(ns, ipcp);
 		goto out_up;
 	case IPC_SET:
 		if (msqid64.msg_qbytes > ns->msg_ctlmnb &&
 		    !capable(CAP_SYS_RESOURCE)) {
 			err = -EPERM;
-			goto out_unlock;
+			goto out_unlock0;
 		}
 
 		err = ipc_update_perm(&msqid64.msg_perm, ipcp);
 		if (err)
-			goto out_unlock;
+			goto out_unlock0;
 
 		msq->q_qbytes = msqid64.msg_qbytes;
 
@@ -448,8 +455,11 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
 	default:
 		err = -EINVAL;
 	}
-out_unlock:
-	msg_unlock(msq);
+
+out_unlock0:
+	ipc_unlock_object(&msq->q_perm);
+out_unlock1:
+	rcu_read_unlock();
 out_up:
 	up_write(&msg_ids(ns).rw_mutex);
 	return err;
diff --git a/ipc/sem.c b/ipc/sem.c
index 92ec6c6..b4b892b 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1289,39 +1289,44 @@ static int semctl_down(struct ipc_namespace *ns, int semid,
 			return -EFAULT;
 	}
 
+	down_write(&sem_ids(ns).rw_mutex);
+	rcu_read_lock();
+
 	ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd,
 				      &semid64.sem_perm, 0);
-	if (IS_ERR(ipcp))
-		return PTR_ERR(ipcp);
+	if (IS_ERR(ipcp)) {
+		err = PTR_ERR(ipcp);
+		/* the ipc lock is not held upon failure */
+		goto out_unlock1;
+	}
 
 	sma = container_of(ipcp, struct sem_array, sem_perm);
 
 	err = security_sem_semctl(sma, cmd);
-	if (err) {
-		rcu_read_unlock();
-		goto out_up;
-	}
+	if (err)
+		goto out_unlock1;
 
-	switch(cmd){
+	switch (cmd) {
 	case IPC_RMID:
 		sem_lock(sma, NULL, -1);
+		/* freeary unlocks the ipc object and rcu */
 		freeary(ns, ipcp);
 		goto out_up;
 	case IPC_SET:
 		sem_lock(sma, NULL, -1);
 		err = ipc_update_perm(&semid64.sem_perm, ipcp);
 		if (err)
-			goto out_unlock;
+			goto out_unlock0;
 		sma->sem_ctime = get_seconds();
 		break;
 	default:
-		rcu_read_unlock();
 		err = -EINVAL;
-		goto out_up;
+		goto out_unlock1;
 	}
 
-out_unlock:
+out_unlock0:
 	sem_unlock(sma, -1);
+out_unlock1:
 	rcu_read_unlock();
 out_up:
 	up_write(&sem_ids(ns).rw_mutex);
diff --git a/ipc/shm.c b/ipc/shm.c
index e7d5107..c6b4ad5 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -757,31 +757,42 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
 			return -EFAULT;
 	}
 
+	down_write(&shm_ids(ns).rw_mutex);
+	rcu_read_lock();
+
 	ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd,
 			       &shmid64.shm_perm, 0);
-	if (IS_ERR(ipcp))
-		return PTR_ERR(ipcp);
+	if (IS_ERR(ipcp)) {
+		err = PTR_ERR(ipcp);
+		/* the ipc lock is not held upon failure */
+		goto out_unlock1;
+	}
 
 	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
 
 	err = security_shm_shmctl(shp, cmd);
 	if (err)
-		goto out_unlock;
+		goto out_unlock0;
+
 	switch (cmd) {
 	case IPC_RMID:
+		/* do_shm_rmid unlocks the ipc object and rcu */
 		do_shm_rmid(ns, ipcp);
 		goto out_up;
 	case IPC_SET:
 		err = ipc_update_perm(&shmid64.shm_perm, ipcp);
 		if (err)
-			goto out_unlock;
+			goto out_unlock0;
 		shp->shm_ctim = get_seconds();
 		break;
 	default:
 		err = -EINVAL;
 	}
-out_unlock:
-	shm_unlock(shp);
+
+out_unlock0:
+	ipc_unlock_object(&shp->shm_perm);
+out_unlock1:
+	rcu_read_unlock();
 out_up:
 	up_write(&shm_ids(ns).rw_mutex);
 	return err;
diff --git a/ipc/util.c b/ipc/util.c
index 399821a..a0c139f 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -746,8 +746,10 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out)
  * It must be called without any lock held and
  *  - retrieves the ipc with the given id in the given table.
  *  - performs some audit and permission check, depending on the given cmd
- *  - returns the ipc with both ipc and rw_mutex locks held in case of success
+ *  - returns the ipc with the ipc lock held in case of success
  *    or an err-code without any lock held otherwise.
+ *
+ * Call holding the both the rw_mutex and the rcu read lock.
  */
 struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns,
 				      struct ipc_ids *ids, int id, int cmd,
@@ -772,13 +774,10 @@ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
 	int err = -EPERM;
 	struct kern_ipc_perm *ipcp;
 
-	down_write(&ids->rw_mutex);
-	rcu_read_lock();
-
 	ipcp = ipc_obtain_object_check(ids, id);
 	if (IS_ERR(ipcp)) {
 		err = PTR_ERR(ipcp);
-		goto out_up;
+		goto err;
 	}
 
 	audit_ipc_obj(ipcp);
@@ -789,16 +788,8 @@ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
 	euid = current_euid();
 	if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid)  ||
 	    ns_capable(ns->user_ns, CAP_SYS_ADMIN))
-		return ipcp;
-
-out_up:
-	/*
-	 * Unsuccessful lookup, unlock and return
-	 * the corresponding error.
-	 */
-	rcu_read_unlock();
-	up_write(&ids->rw_mutex);
-
+		return ipcp; /* successful lookup */
+err:
 	return ERR_PTR(err);
 }
 
-- 
cgit v0.10.2


From 15724ecb7e9bab35fc694c666ad563adba820cc3 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr.bueso@hp.com>
Date: Mon, 8 Jul 2013 16:01:13 -0700
Subject: ipc,msg: shorten critical region in msgctl_down

Instead of holding the ipc lock for the entire function, use the
ipcctl_pre_down_nolock and only acquire the lock for specific commands:
RMID and SET.

Signed-off-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/msg.c b/ipc/msg.c
index f62fa5e..de422ff 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -410,11 +410,10 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
 	down_write(&msg_ids(ns).rw_mutex);
 	rcu_read_lock();
 
-	ipcp = ipcctl_pre_down(ns, &msg_ids(ns), msqid, cmd,
-			       &msqid64.msg_perm, msqid64.msg_qbytes);
+	ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd,
+				      &msqid64.msg_perm, msqid64.msg_qbytes);
 	if (IS_ERR(ipcp)) {
 		err = PTR_ERR(ipcp);
-		/* the ipc lock is not held upon failure */
 		goto out_unlock1;
 	}
 
@@ -422,10 +421,11 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
 
 	err = security_msg_queue_msgctl(msq, cmd);
 	if (err)
-		goto out_unlock0;
+		goto out_unlock1;
 
 	switch (cmd) {
 	case IPC_RMID:
+		ipc_lock_object(&msq->q_perm);
 		/* freeque unlocks the ipc object and rcu */
 		freeque(ns, ipcp);
 		goto out_up;
@@ -433,9 +433,10 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
 		if (msqid64.msg_qbytes > ns->msg_ctlmnb &&
 		    !capable(CAP_SYS_RESOURCE)) {
 			err = -EPERM;
-			goto out_unlock0;
+			goto out_unlock1;
 		}
 
+		ipc_lock_object(&msq->q_perm);
 		err = ipc_update_perm(&msqid64.msg_perm, ipcp);
 		if (err)
 			goto out_unlock0;
@@ -454,6 +455,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
 		break;
 	default:
 		err = -EINVAL;
+		goto out_unlock1;
 	}
 
 out_unlock0:
-- 
cgit v0.10.2


From 2cafed30f150f7314f98717b372df8173516cae0 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr.bueso@hp.com>
Date: Mon, 8 Jul 2013 16:01:14 -0700
Subject: ipc,msg: introduce msgctl_nolock

Similar to semctl, when calling msgctl, the *_INFO and *_STAT commands
can be performed without acquiring the ipc object.

Add a msgctl_nolock() function and move the logic of *_INFO and *_STAT
out of msgctl().  This change still takes the lock and it will be
properly lockless in the next patch

Signed-off-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/msg.c b/ipc/msg.c
index de422ff..f45be81 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -467,17 +467,11 @@ out_up:
 	return err;
 }
 
-SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)
+static int msgctl_nolock(struct ipc_namespace *ns, int msqid,
+			 int cmd, int version, void __user *buf)
 {
+	int err;
 	struct msg_queue *msq;
-	int err, version;
-	struct ipc_namespace *ns;
-
-	if (msqid < 0 || cmd < 0)
-		return -EINVAL;
-
-	version = ipc_parse_version(&cmd);
-	ns = current->nsproxy->ipc_ns;
 
 	switch (cmd) {
 	case IPC_INFO:
@@ -488,6 +482,7 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)
 
 		if (!buf)
 			return -EFAULT;
+
 		/*
 		 * We must not return kernel stack data.
 		 * due to padding, it's not enough
@@ -519,7 +514,8 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)
 			return -EFAULT;
 		return (max_id < 0) ? 0 : max_id;
 	}
-	case MSG_STAT:	/* msqid is an index rather than a msg queue id */
+
+	case MSG_STAT:
 	case IPC_STAT:
 	{
 		struct msqid64_ds tbuf;
@@ -563,19 +559,42 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)
 			return -EFAULT;
 		return success_return;
 	}
-	case IPC_SET:
-	case IPC_RMID:
-		err = msgctl_down(ns, msqid, cmd, buf, version);
-		return err;
+
 	default:
-		return  -EINVAL;
+		return -EINVAL;
 	}
 
+	return err;
 out_unlock:
 	msg_unlock(msq);
 	return err;
 }
 
+SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)
+{
+	int version;
+	struct ipc_namespace *ns;
+
+	if (msqid < 0 || cmd < 0)
+		return -EINVAL;
+
+	version = ipc_parse_version(&cmd);
+	ns = current->nsproxy->ipc_ns;
+
+	switch (cmd) {
+	case IPC_INFO:
+	case MSG_INFO:
+	case MSG_STAT:	/* msqid is an index rather than a msg queue id */
+	case IPC_STAT:
+		return msgctl_nolock(ns, msqid, cmd, version, buf);
+	case IPC_SET:
+	case IPC_RMID:
+		return msgctl_down(ns, msqid, cmd, buf, version);
+	default:
+		return  -EINVAL;
+	}
+}
+
 static int testmsg(struct msg_msg *msg, long type, int mode)
 {
 	switch(mode)
-- 
cgit v0.10.2


From a5001a0d9768568de5d613c3b3a5b9c7721299da Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr.bueso@hp.com>
Date: Mon, 8 Jul 2013 16:01:15 -0700
Subject: ipc,msg: introduce lockless functions to obtain the ipc object

Add msq_obtain_object() and msq_obtain_object_check(), which will allow
us to get the ipc object without acquiring the lock.  Just as with
semaphores, these functions are basically wrappers around
ipc_obtain_object*().

Signed-off-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/msg.c b/ipc/msg.c
index f45be81..c53c137 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -166,6 +166,27 @@ static inline struct msg_queue *msg_lock_check(struct ipc_namespace *ns,
 	return container_of(ipcp, struct msg_queue, q_perm);
 }
 
+static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id)
+{
+	struct kern_ipc_perm *ipcp = ipc_obtain_object(&msg_ids(ns), id);
+
+	if (IS_ERR(ipcp))
+		return ERR_CAST(ipcp);
+
+	return container_of(ipcp, struct msg_queue, q_perm);
+}
+
+static inline struct msg_queue *msq_obtain_object_check(struct ipc_namespace *ns,
+							int id)
+{
+	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&msg_ids(ns), id);
+
+	if (IS_ERR(ipcp))
+		return ERR_CAST(ipcp);
+
+	return container_of(ipcp, struct msg_queue, q_perm);
+}
+
 static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s)
 {
 	ipc_rmid(&msg_ids(ns), &s->q_perm);
-- 
cgit v0.10.2


From ac0ba20ea6f2201a1589d6dc26ad1a4f0f967bb8 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr.bueso@hp.com>
Date: Mon, 8 Jul 2013 16:01:16 -0700
Subject: ipc,msg: make msgctl_nolock lockless

While the INFO cmd doesn't take the ipc lock, the STAT commands do
acquire it unnecessarily.  We can do the permissions and security checks
only holding the rcu lock.

This function now mimics semctl_nolock().

Signed-off-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/msg.c b/ipc/msg.c
index c53c137..c218328 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -545,17 +545,25 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid,
 		if (!buf)
 			return -EFAULT;
 
+		memset(&tbuf, 0, sizeof(tbuf));
+
+		rcu_read_lock();
 		if (cmd == MSG_STAT) {
-			msq = msg_lock(ns, msqid);
-			if (IS_ERR(msq))
-				return PTR_ERR(msq);
+			msq = msq_obtain_object(ns, msqid);
+			if (IS_ERR(msq)) {
+				err = PTR_ERR(msq);
+				goto out_unlock;
+			}
 			success_return = msq->q_perm.id;
 		} else {
-			msq = msg_lock_check(ns, msqid);
-			if (IS_ERR(msq))
-				return PTR_ERR(msq);
+			msq = msq_obtain_object_check(ns, msqid);
+			if (IS_ERR(msq)) {
+				err = PTR_ERR(msq);
+				goto out_unlock;
+			}
 			success_return = 0;
 		}
+
 		err = -EACCES;
 		if (ipcperms(ns, &msq->q_perm, S_IRUGO))
 			goto out_unlock;
@@ -564,8 +572,6 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid,
 		if (err)
 			goto out_unlock;
 
-		memset(&tbuf, 0, sizeof(tbuf));
-
 		kernel_to_ipc64_perm(&msq->q_perm, &tbuf.msg_perm);
 		tbuf.msg_stime  = msq->q_stime;
 		tbuf.msg_rtime  = msq->q_rtime;
@@ -575,7 +581,8 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid,
 		tbuf.msg_qbytes = msq->q_qbytes;
 		tbuf.msg_lspid  = msq->q_lspid;
 		tbuf.msg_lrpid  = msq->q_lrpid;
-		msg_unlock(msq);
+		rcu_read_unlock();
+
 		if (copy_msqid_to_user(buf, &tbuf, version))
 			return -EFAULT;
 		return success_return;
@@ -587,7 +594,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid,
 
 	return err;
 out_unlock:
-	msg_unlock(msq);
+	rcu_read_unlock();
 	return err;
 }
 
-- 
cgit v0.10.2


From 3dd1f784ed6603d7ab1043e51e6371235edf2313 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr.bueso@hp.com>
Date: Mon, 8 Jul 2013 16:01:17 -0700
Subject: ipc,msg: shorten critical region in msgsnd

do_msgsnd() is another function that does too many things with the ipc
object lock acquired.  Take it only when needed when actually updating
msq.

Signed-off-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/msg.c b/ipc/msg.c
index c218328..f2a1a8f 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -698,10 +698,11 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
 	msg->m_type = mtype;
 	msg->m_ts = msgsz;
 
-	msq = msg_lock_check(ns, msqid);
+	rcu_read_lock();
+	msq = msq_obtain_object_check(ns, msqid);
 	if (IS_ERR(msq)) {
 		err = PTR_ERR(msq);
-		goto out_free;
+		goto out_unlock1;
 	}
 
 	for (;;) {
@@ -709,11 +710,11 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
 
 		err = -EACCES;
 		if (ipcperms(ns, &msq->q_perm, S_IWUGO))
-			goto out_unlock_free;
+			goto out_unlock1;
 
 		err = security_msg_queue_msgsnd(msq, msg, msgflg);
 		if (err)
-			goto out_unlock_free;
+			goto out_unlock1;
 
 		if (msgsz + msq->q_cbytes <= msq->q_qbytes &&
 				1 + msq->q_qnum <= msq->q_qbytes) {
@@ -723,32 +724,41 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
 		/* queue full, wait: */
 		if (msgflg & IPC_NOWAIT) {
 			err = -EAGAIN;
-			goto out_unlock_free;
+			goto out_unlock1;
 		}
+
+		ipc_lock_object(&msq->q_perm);
 		ss_add(msq, &s);
 
 		if (!ipc_rcu_getref(msq)) {
 			err = -EIDRM;
-			goto out_unlock_free;
+			goto out_unlock0;
 		}
 
-		msg_unlock(msq);
+		ipc_unlock_object(&msq->q_perm);
+		rcu_read_unlock();
 		schedule();
 
-		ipc_lock_by_ptr(&msq->q_perm);
+		rcu_read_lock();
+		ipc_lock_object(&msq->q_perm);
+
 		ipc_rcu_putref(msq);
 		if (msq->q_perm.deleted) {
 			err = -EIDRM;
-			goto out_unlock_free;
+			goto out_unlock0;
 		}
+
 		ss_del(&s);
 
 		if (signal_pending(current)) {
 			err = -ERESTARTNOHAND;
-			goto out_unlock_free;
+			goto out_unlock0;
 		}
+
+		ipc_unlock_object(&msq->q_perm);
 	}
 
+	ipc_lock_object(&msq->q_perm);
 	msq->q_lspid = task_tgid_vnr(current);
 	msq->q_stime = get_seconds();
 
@@ -764,9 +774,10 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
 	err = 0;
 	msg = NULL;
 
-out_unlock_free:
-	msg_unlock(msq);
-out_free:
+out_unlock0:
+	ipc_unlock_object(&msq->q_perm);
+out_unlock1:
+	rcu_read_unlock();
 	if (msg != NULL)
 		free_msg(msg);
 	return err;
-- 
cgit v0.10.2


From 41a0d523d0f626e9da0dc01de47f1b89058033cf Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr.bueso@hp.com>
Date: Mon, 8 Jul 2013 16:01:18 -0700
Subject: ipc,msg: shorten critical region in msgrcv

do_msgrcv() is the last msg queue function that abuses the ipc lock Take
it only when needed when actually updating msq.

Signed-off-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Rik van Riel <riel@redhat.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/msg.c b/ipc/msg.c
index f2a1a8f..a3c0dc4 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -885,21 +885,19 @@ static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode)
 	return ERR_PTR(-EAGAIN);
 }
 
-
-long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp,
-	       int msgflg,
+long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg,
 	       long (*msg_handler)(void __user *, struct msg_msg *, size_t))
 {
-	struct msg_queue *msq;
-	struct msg_msg *msg;
 	int mode;
+	struct msg_queue *msq;
 	struct ipc_namespace *ns;
-	struct msg_msg *copy = NULL;
+	struct msg_msg *msg, *copy = NULL;
 
 	ns = current->nsproxy->ipc_ns;
 
 	if (msqid < 0 || (long) bufsz < 0)
 		return -EINVAL;
+
 	if (msgflg & MSG_COPY) {
 		copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax));
 		if (IS_ERR(copy))
@@ -907,8 +905,10 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp,
 	}
 	mode = convert_mode(&msgtyp, msgflg);
 
-	msq = msg_lock_check(ns, msqid);
+	rcu_read_lock();
+	msq = msq_obtain_object_check(ns, msqid);
 	if (IS_ERR(msq)) {
+		rcu_read_unlock();
 		free_copy(copy);
 		return PTR_ERR(msq);
 	}
@@ -918,10 +918,10 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp,
 
 		msg = ERR_PTR(-EACCES);
 		if (ipcperms(ns, &msq->q_perm, S_IRUGO))
-			goto out_unlock;
+			goto out_unlock1;
 
+		ipc_lock_object(&msq->q_perm);
 		msg = find_msg(msq, &msgtyp, mode);
-
 		if (!IS_ERR(msg)) {
 			/*
 			 * Found a suitable message.
@@ -929,7 +929,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp,
 			 */
 			if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) {
 				msg = ERR_PTR(-E2BIG);
-				goto out_unlock;
+				goto out_unlock0;
 			}
 			/*
 			 * If we are copying, then do not unlink message and do
@@ -937,8 +937,9 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp,
 			 */
 			if (msgflg & MSG_COPY) {
 				msg = copy_msg(msg, copy);
-				goto out_unlock;
+				goto out_unlock0;
 			}
+
 			list_del(&msg->m_list);
 			msq->q_qnum--;
 			msq->q_rtime = get_seconds();
@@ -947,14 +948,16 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp,
 			atomic_sub(msg->m_ts, &ns->msg_bytes);
 			atomic_dec(&ns->msg_hdrs);
 			ss_wakeup(&msq->q_senders, 0);
-			msg_unlock(msq);
-			break;
+
+			goto out_unlock0;
 		}
+
 		/* No message waiting. Wait for a message */
 		if (msgflg & IPC_NOWAIT) {
 			msg = ERR_PTR(-ENOMSG);
-			goto out_unlock;
+			goto out_unlock0;
 		}
+
 		list_add_tail(&msr_d.r_list, &msq->q_receivers);
 		msr_d.r_tsk = current;
 		msr_d.r_msgtype = msgtyp;
@@ -965,8 +968,9 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp,
 			msr_d.r_maxsize = bufsz;
 		msr_d.r_msg = ERR_PTR(-EAGAIN);
 		current->state = TASK_INTERRUPTIBLE;
-		msg_unlock(msq);
 
+		ipc_unlock_object(&msq->q_perm);
+		rcu_read_unlock();
 		schedule();
 
 		/* Lockless receive, part 1:
@@ -977,7 +981,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp,
 		 * Prior to destruction, expunge_all(-EIRDM) changes r_msg.
 		 * Thus if r_msg is -EAGAIN, then the queue not yet destroyed.
 		 * rcu_read_lock() prevents preemption between reading r_msg
-		 * and the spin_lock() inside ipc_lock_by_ptr().
+		 * and acquiring the q_perm.lock in ipc_lock_object().
 		 */
 		rcu_read_lock();
 
@@ -996,32 +1000,34 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp,
 		 * If there is a message or an error then accept it without
 		 * locking.
 		 */
-		if (msg != ERR_PTR(-EAGAIN)) {
-			rcu_read_unlock();
-			break;
-		}
+		if (msg != ERR_PTR(-EAGAIN))
+			goto out_unlock1;
 
 		/* Lockless receive, part 3:
 		 * Acquire the queue spinlock.
 		 */
-		ipc_lock_by_ptr(&msq->q_perm);
-		rcu_read_unlock();
+		ipc_lock_object(&msq->q_perm);
 
 		/* Lockless receive, part 4:
 		 * Repeat test after acquiring the spinlock.
 		 */
 		msg = (struct msg_msg*)msr_d.r_msg;
 		if (msg != ERR_PTR(-EAGAIN))
-			goto out_unlock;
+			goto out_unlock0;
 
 		list_del(&msr_d.r_list);
 		if (signal_pending(current)) {
 			msg = ERR_PTR(-ERESTARTNOHAND);
-out_unlock:
-			msg_unlock(msq);
-			break;
+			goto out_unlock0;
 		}
+
+		ipc_unlock_object(&msq->q_perm);
 	}
+
+out_unlock0:
+	ipc_unlock_object(&msq->q_perm);
+out_unlock1:
+	rcu_read_unlock();
 	if (IS_ERR(msg)) {
 		free_copy(copy);
 		return PTR_ERR(msg);
-- 
cgit v0.10.2


From 9ad66ae65fc8d3e7e3344310fb0aa835910264fe Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr.bueso@hp.com>
Date: Mon, 8 Jul 2013 16:01:19 -0700
Subject: ipc: remove unused functions

We can now drop the msg_lock and msg_lock_check functions along with a
bogus comment introduced previously in semctl_down.

Signed-off-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/msg.c b/ipc/msg.c
index a3c0dc4..bd60d7e 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -141,31 +141,6 @@ void __init msg_init(void)
 				IPC_MSG_IDS, sysvipc_msg_proc_show);
 }
 
-/*
- * msg_lock_(check_) routines are called in the paths where the rw_mutex
- * is not held.
- */
-static inline struct msg_queue *msg_lock(struct ipc_namespace *ns, int id)
-{
-	struct kern_ipc_perm *ipcp = ipc_lock(&msg_ids(ns), id);
-
-	if (IS_ERR(ipcp))
-		return (struct msg_queue *)ipcp;
-
-	return container_of(ipcp, struct msg_queue, q_perm);
-}
-
-static inline struct msg_queue *msg_lock_check(struct ipc_namespace *ns,
-						int id)
-{
-	struct kern_ipc_perm *ipcp = ipc_lock_check(&msg_ids(ns), id);
-
-	if (IS_ERR(ipcp))
-		return (struct msg_queue *)ipcp;
-
-	return container_of(ipcp, struct msg_queue, q_perm);
-}
-
 static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id)
 {
 	struct kern_ipc_perm *ipcp = ipc_obtain_object(&msg_ids(ns), id);
diff --git a/ipc/sem.c b/ipc/sem.c
index b4b892b..d3ad357 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1296,7 +1296,6 @@ static int semctl_down(struct ipc_namespace *ns, int semid,
 				      &semid64.sem_perm, 0);
 	if (IS_ERR(ipcp)) {
 		err = PTR_ERR(ipcp);
-		/* the ipc lock is not held upon failure */
 		goto out_unlock1;
 	}
 
-- 
cgit v0.10.2


From 196aa0132fc7261f34b10ae1bfb44abc1bc69b3c Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Mon, 8 Jul 2013 16:01:20 -0700
Subject: ipc/util.c, ipc_rcu_alloc: cacheline align allocation

Enforce that ipc_rcu_alloc returns a cacheline aligned pointer on SMP.

Rationale:

The SysV sem code tries to move the main spinlock into a seperate
cacheline (____cacheline_aligned_in_smp).  This works only if
ipc_rcu_alloc returns cacheline aligned pointers.  vmalloc and kmalloc
return cacheline algined pointers, the implementation of ipc_rcu_alloc
breaks that.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/util.c b/ipc/util.c
index a0c139f..4704223 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -468,9 +468,7 @@ void ipc_free(void* ptr, int size)
 struct ipc_rcu {
 	struct rcu_head rcu;
 	atomic_t refcount;
-	/* "void *" makes sure alignment of following data is sane. */
-	void *data[0];
-};
+} ____cacheline_aligned_in_smp;
 
 /**
  *	ipc_rcu_alloc	-	allocate ipc and rcu space 
@@ -488,12 +486,14 @@ void *ipc_rcu_alloc(int size)
 	if (unlikely(!out))
 		return NULL;
 	atomic_set(&out->refcount, 1);
-	return out->data;
+	return out + 1;
 }
 
 int ipc_rcu_getref(void *ptr)
 {
-	return atomic_inc_not_zero(&container_of(ptr, struct ipc_rcu, data)->refcount);
+	struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1;
+
+	return atomic_inc_not_zero(&p->refcount);
 }
 
 /**
@@ -507,7 +507,7 @@ static void ipc_schedule_free(struct rcu_head *head)
 
 void ipc_rcu_putref(void *ptr)
 {
-	struct ipc_rcu *p = container_of(ptr, struct ipc_rcu, data);
+	struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1;
 
 	if (!atomic_dec_and_test(&p->refcount))
 		return;
-- 
cgit v0.10.2


From f5c936c0f267ec58641451cf8b8d39b4c207ee4d Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Mon, 8 Jul 2013 16:01:22 -0700
Subject: ipc/sem.c: cacheline align the semaphore structures

As now each semaphore has its own spinlock and parallel operations are
possible, give each semaphore its own cacheline.

On a i3 laptop, this gives up to 28% better performance:

  #semscale 10 | grep "interleave 2"
  - before:
  Cpus 1, interleave 2 delay 0: 36109234 in 10 secs
  Cpus 2, interleave 2 delay 0: 55276317 in 10 secs
  Cpus 3, interleave 2 delay 0: 62411025 in 10 secs
  Cpus 4, interleave 2 delay 0: 81963928 in 10 secs

  -after:
  Cpus 1, interleave 2 delay 0: 35527306 in 10 secs
  Cpus 2, interleave 2 delay 0: 70922909 in 10 secs <<< + 28%
  Cpus 3, interleave 2 delay 0: 80518538 in 10 secs
  Cpus 4, interleave 2 delay 0: 89115148 in 10 secs <<< + 8.7%

i3, with 2 cores and with hyperthreading enabled.  Interleave 2 in order
use first the full cores.  HT partially hides the delay from cacheline
trashing, thus the improvement is "only" 8.7% if 4 threads are running.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/sem.c b/ipc/sem.c
index d3ad357..8498b67 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -96,7 +96,7 @@ struct sem {
 	int	sempid;		/* pid of last operation */
 	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
 	struct list_head sem_pending; /* pending single-sop operations */
-};
+} ____cacheline_aligned_in_smp;
 
 /* One queue for each sleeping process in the system. */
 struct sem_queue {
-- 
cgit v0.10.2


From 1a82e9e1d0f1b45f47a97c9e2349020536ff8987 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Mon, 8 Jul 2013 16:01:23 -0700
Subject: ipc/sem: separate wait-for-zero and alter tasks into seperate queues

Introduce separate queues for operations that do not modify the
semaphore values.  Advantages:

 - Simpler logic in check_restart().
 - Faster update_queue(): Right now, all wait-for-zero operations are
   always tested, even if the semaphore value is not 0.
 - wait-for-zero gets again priority, as in linux <=3.0.9

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/sem.h b/include/linux/sem.h
index 53d4265..55e17f6 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -15,7 +15,10 @@ struct sem_array {
 	time_t			sem_otime;	/* last semop time */
 	time_t			sem_ctime;	/* last change time */
 	struct sem		*sem_base;	/* ptr to first semaphore in array */
-	struct list_head	sem_pending;	/* pending operations to be processed */
+	struct list_head	pending_alter;	/* pending operations */
+						/* that alter the array */
+	struct list_head	pending_const;	/* pending complex operations */
+						/* that do not alter semvals */
 	struct list_head	list_id;	/* undo requests on this array */
 	int			sem_nsems;	/* no. of semaphores in array */
 	int			complex_count;	/* pending complex operations */
diff --git a/ipc/sem.c b/ipc/sem.c
index 8498b67..4d7f88c 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -95,7 +95,10 @@ struct sem {
 	int	semval;		/* current value */
 	int	sempid;		/* pid of last operation */
 	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
-	struct list_head sem_pending; /* pending single-sop operations */
+	struct list_head pending_alter; /* pending single-sop operations */
+					/* that alter the semaphore */
+	struct list_head pending_const; /* pending single-sop operations */
+					/* that do not alter the semaphore*/
 } ____cacheline_aligned_in_smp;
 
 /* One queue for each sleeping process in the system. */
@@ -152,7 +155,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
 /*
  * linked list protection:
  *	sem_undo.id_next,
- *	sem_array.sem_pending{,last},
+ *	sem_array.pending{_alter,_cont},
  *	sem_array.sem_undo: sem_lock() for read/write
  *	sem_undo.proc_next: only "current" is allowed to read/write that field.
  *	
@@ -337,7 +340,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
  * Without the check/retry algorithm a lockless wakeup is possible:
  * - queue.status is initialized to -EINTR before blocking.
  * - wakeup is performed by
- *	* unlinking the queue entry from sma->sem_pending
+ *	* unlinking the queue entry from the pending list
  *	* setting queue.status to IN_WAKEUP
  *	  This is the notification for the blocked thread that a
  *	  result value is imminent.
@@ -418,12 +421,14 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 	sma->sem_base = (struct sem *) &sma[1];
 
 	for (i = 0; i < nsems; i++) {
-		INIT_LIST_HEAD(&sma->sem_base[i].sem_pending);
+		INIT_LIST_HEAD(&sma->sem_base[i].pending_alter);
+		INIT_LIST_HEAD(&sma->sem_base[i].pending_const);
 		spin_lock_init(&sma->sem_base[i].lock);
 	}
 
 	sma->complex_count = 0;
-	INIT_LIST_HEAD(&sma->sem_pending);
+	INIT_LIST_HEAD(&sma->pending_alter);
+	INIT_LIST_HEAD(&sma->pending_const);
 	INIT_LIST_HEAD(&sma->list_id);
 	sma->sem_nsems = nsems;
 	sma->sem_ctime = get_seconds();
@@ -609,60 +614,132 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
  * update_queue is O(N^2) when it restarts scanning the whole queue of
  * waiting operations. Therefore this function checks if the restart is
  * really necessary. It is called after a previously waiting operation
- * was completed.
+ * modified the array.
+ * Note that wait-for-zero operations are handled without restart.
  */
 static int check_restart(struct sem_array *sma, struct sem_queue *q)
 {
-	struct sem *curr;
-	struct sem_queue *h;
-
-	/* if the operation didn't modify the array, then no restart */
-	if (q->alter == 0)
-		return 0;
-
-	/* pending complex operations are too difficult to analyse */
-	if (sma->complex_count)
+	/* pending complex alter operations are too difficult to analyse */
+	if (!list_empty(&sma->pending_alter))
 		return 1;
 
 	/* we were a sleeping complex operation. Too difficult */
 	if (q->nsops > 1)
 		return 1;
 
-	curr = sma->sem_base + q->sops[0].sem_num;
+	/* It is impossible that someone waits for the new value:
+	 * - complex operations always restart.
+	 * - wait-for-zero are handled seperately.
+	 * - q is a previously sleeping simple operation that
+	 *   altered the array. It must be a decrement, because
+	 *   simple increments never sleep.
+	 * - If there are older (higher priority) decrements
+	 *   in the queue, then they have observed the original
+	 *   semval value and couldn't proceed. The operation
+	 *   decremented to value - thus they won't proceed either.
+	 */
+	return 0;
+}
 
-	/* No-one waits on this queue */
-	if (list_empty(&curr->sem_pending))
-		return 0;
+/**
+ * wake_const_ops(sma, semnum, pt) - Wake up non-alter tasks
+ * @sma: semaphore array.
+ * @semnum: semaphore that was modified.
+ * @pt: list head for the tasks that must be woken up.
+ *
+ * wake_const_ops must be called after a semaphore in a semaphore array
+ * was set to 0. If complex const operations are pending, wake_const_ops must
+ * be called with semnum = -1, as well as with the number of each modified
+ * semaphore.
+ * The tasks that must be woken up are added to @pt. The return code
+ * is stored in q->pid.
+ * The function returns 1 if at least one operation was completed successfully.
+ */
+static int wake_const_ops(struct sem_array *sma, int semnum,
+				struct list_head *pt)
+{
+	struct sem_queue *q;
+	struct list_head *walk;
+	struct list_head *pending_list;
+	int semop_completed = 0;
+
+	if (semnum == -1)
+		pending_list = &sma->pending_const;
+	else
+		pending_list = &sma->sem_base[semnum].pending_const;
 
-	/* the new semaphore value */
-	if (curr->semval) {
-		/* It is impossible that someone waits for the new value:
-		 * - q is a previously sleeping simple operation that
-		 *   altered the array. It must be a decrement, because
-		 *   simple increments never sleep.
-		 * - The value is not 0, thus wait-for-zero won't proceed.
-		 * - If there are older (higher priority) decrements
-		 *   in the queue, then they have observed the original
-		 *   semval value and couldn't proceed. The operation
-		 *   decremented to value - thus they won't proceed either.
+	walk = pending_list->next;
+	while (walk != pending_list) {
+		int error;
+
+		q = container_of(walk, struct sem_queue, list);
+		walk = walk->next;
+
+		error = try_atomic_semop(sma, q->sops, q->nsops,
+						q->undo, q->pid);
+
+		if (error <= 0) {
+			/* operation completed, remove from queue & wakeup */
+
+			unlink_queue(sma, q);
+
+			wake_up_sem_queue_prepare(pt, q, error);
+			if (error == 0)
+				semop_completed = 1;
+		}
+	}
+	return semop_completed;
+}
+
+/**
+ * do_smart_wakeup_zero(sma, sops, nsops, pt) - wakeup all wait for zero tasks
+ * @sma: semaphore array
+ * @sops: operations that were performed
+ * @nsops: number of operations
+ * @pt: list head of the tasks that must be woken up.
+ *
+ * do_smart_wakeup_zero() checks all required queue for wait-for-zero
+ * operations, based on the actual changes that were performed on the
+ * semaphore array.
+ * The function returns 1 if at least one operation was completed successfully.
+ */
+static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
+					int nsops, struct list_head *pt)
+{
+	int i;
+	int semop_completed = 0;
+	int got_zero = 0;
+
+	/* first: the per-semaphore queues, if known */
+	if (sops) {
+		for (i = 0; i < nsops; i++) {
+			int num = sops[i].sem_num;
+
+			if (sma->sem_base[num].semval == 0) {
+				got_zero = 1;
+				semop_completed |= wake_const_ops(sma, num, pt);
+			}
+		}
+	} else {
+		/*
+		 * No sops means modified semaphores not known.
+		 * Assume all were changed.
 		 */
-		BUG_ON(q->sops[0].sem_op >= 0);
-		return 0;
+		for (i = 0; i < sma->sem_nsems; i++) {
+			if (sma->sem_base[i].semval == 0) {
+				got_zero = 1;
+				semop_completed |= wake_const_ops(sma, i, pt);
+			}
+		}
 	}
 	/*
-	 * semval is 0. Check if there are wait-for-zero semops.
-	 * They must be the first entries in the per-semaphore queue
+	 * If one of the modified semaphores got 0,
+	 * then check the global queue, too.
 	 */
-	h = list_first_entry(&curr->sem_pending, struct sem_queue, list);
-	BUG_ON(h->nsops != 1);
-	BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num);
+	if (got_zero)
+		semop_completed |= wake_const_ops(sma, -1, pt);
 
-	/* Yes, there is a wait-for-zero semop. Restart */
-	if (h->sops[0].sem_op == 0)
-		return 1;
-
-	/* Again - no-one is waiting for the new value. */
-	return 0;
+	return semop_completed;
 }
 
 
@@ -678,6 +755,8 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q)
  * semaphore.
  * The tasks that must be woken up are added to @pt. The return code
  * is stored in q->pid.
+ * The function internally checks if const operations can now succeed.
+ *
  * The function return 1 if at least one semop was completed successfully.
  */
 static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
@@ -688,9 +767,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
 	int semop_completed = 0;
 
 	if (semnum == -1)
-		pending_list = &sma->sem_pending;
+		pending_list = &sma->pending_alter;
 	else
-		pending_list = &sma->sem_base[semnum].sem_pending;
+		pending_list = &sma->sem_base[semnum].pending_alter;
 
 again:
 	walk = pending_list->next;
@@ -702,13 +781,12 @@ again:
 
 		/* If we are scanning the single sop, per-semaphore list of
 		 * one semaphore and that semaphore is 0, then it is not
-		 * necessary to scan the "alter" entries: simple increments
+		 * necessary to scan further: simple increments
 		 * that affect only one entry succeed immediately and cannot
 		 * be in the  per semaphore pending queue, and decrements
 		 * cannot be successful if the value is already 0.
 		 */
-		if (semnum != -1 && sma->sem_base[semnum].semval == 0 &&
-				q->alter)
+		if (semnum != -1 && sma->sem_base[semnum].semval == 0)
 			break;
 
 		error = try_atomic_semop(sma, q->sops, q->nsops,
@@ -724,6 +802,7 @@ again:
 			restart = 0;
 		} else {
 			semop_completed = 1;
+			do_smart_wakeup_zero(sma, q->sops, q->nsops, pt);
 			restart = check_restart(sma, q);
 		}
 
@@ -742,8 +821,8 @@ again:
  * @otime: force setting otime
  * @pt: list head of the tasks that must be woken up.
  *
- * do_smart_update() does the required called to update_queue, based on the
- * actual changes that were performed on the semaphore array.
+ * do_smart_update() does the required calls to update_queue and wakeup_zero,
+ * based on the actual changes that were performed on the semaphore array.
  * Note that the function does not do the actual wake-up: the caller is
  * responsible for calling wake_up_sem_queue_do(@pt).
  * It is safe to perform this call after dropping all locks.
@@ -754,6 +833,8 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
 	int i;
 	int progress;
 
+	otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);
+
 	progress = 1;
 retry_global:
 	if (sma->complex_count) {
@@ -813,14 +894,14 @@ static int count_semncnt (struct sem_array * sma, ushort semnum)
 	struct sem_queue * q;
 
 	semncnt = 0;
-	list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) {
+	list_for_each_entry(q, &sma->sem_base[semnum].pending_alter, list) {
 		struct sembuf * sops = q->sops;
 		BUG_ON(sops->sem_num != semnum);
 		if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT))
 			semncnt++;
 	}
 
-	list_for_each_entry(q, &sma->sem_pending, list) {
+	list_for_each_entry(q, &sma->pending_alter, list) {
 		struct sembuf * sops = q->sops;
 		int nsops = q->nsops;
 		int i;
@@ -839,14 +920,14 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum)
 	struct sem_queue * q;
 
 	semzcnt = 0;
-	list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) {
+	list_for_each_entry(q, &sma->sem_base[semnum].pending_const, list) {
 		struct sembuf * sops = q->sops;
 		BUG_ON(sops->sem_num != semnum);
 		if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT))
 			semzcnt++;
 	}
 
-	list_for_each_entry(q, &sma->sem_pending, list) {
+	list_for_each_entry(q, &sma->pending_const, list) {
 		struct sembuf * sops = q->sops;
 		int nsops = q->nsops;
 		int i;
@@ -884,13 +965,22 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 
 	/* Wake up all pending processes and let them fail with EIDRM. */
 	INIT_LIST_HEAD(&tasks);
-	list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
+	list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
+		unlink_queue(sma, q);
+		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+	}
+
+	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
 		unlink_queue(sma, q);
 		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
 	}
 	for (i = 0; i < sma->sem_nsems; i++) {
 		struct sem *sem = sma->sem_base + i;
-		list_for_each_entry_safe(q, tq, &sem->sem_pending, list) {
+		list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
+			unlink_queue(sma, q);
+			wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+		}
+		list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
 			unlink_queue(sma, q);
 			wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
 		}
@@ -1658,14 +1748,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 		curr = &sma->sem_base[sops->sem_num];
 
 		if (alter)
-			list_add_tail(&queue.list, &curr->sem_pending);
+			list_add_tail(&queue.list, &curr->pending_alter);
 		else
-			list_add(&queue.list, &curr->sem_pending);
+			list_add_tail(&queue.list, &curr->pending_const);
 	} else {
 		if (alter)
-			list_add_tail(&queue.list, &sma->sem_pending);
+			list_add_tail(&queue.list, &sma->pending_alter);
 		else
-			list_add(&queue.list, &sma->sem_pending);
+			list_add_tail(&queue.list, &sma->pending_const);
+
 		sma->complex_count++;
 	}
 
-- 
cgit v0.10.2


From f269f40ad5aeee229ed70044926f44318abe41ef Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Mon, 8 Jul 2013 16:01:24 -0700
Subject: ipc/sem.c: always use only one queue for alter operations

There are two places that can contain alter operations:
 - the global queue: sma->pending_alter
 - the per-semaphore queues: sma->sem_base[].pending_alter.

Since one of the queues must be processed first, this causes an odd
priorization of the wakeups: complex operations have priority over
simple ops.

The patch restores the behavior of linux <=3.0.9: The longest waiting
operation has the highest priority.

This is done by using only one queue:
 - if there are complex ops, then sma->pending_alter is used.
 - otherwise, the per-semaphore queues are used.

As a side effect, do_smart_update_queue() becomes much simpler: no more
goto logic.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/sem.c b/ipc/sem.c
index 4d7f88c..6291257 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -192,6 +192,53 @@ void __init sem_init (void)
 				IPC_SEM_IDS, sysvipc_sem_proc_show);
 }
 
+/**
+ * unmerge_queues - unmerge queues, if possible.
+ * @sma: semaphore array
+ *
+ * The function unmerges the wait queues if complex_count is 0.
+ * It must be called prior to dropping the global semaphore array lock.
+ */
+static void unmerge_queues(struct sem_array *sma)
+{
+	struct sem_queue *q, *tq;
+
+	/* complex operations still around? */
+	if (sma->complex_count)
+		return;
+	/*
+	 * We will switch back to simple mode.
+	 * Move all pending operation back into the per-semaphore
+	 * queues.
+	 */
+	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
+		struct sem *curr;
+		curr = &sma->sem_base[q->sops[0].sem_num];
+
+		list_add_tail(&q->list, &curr->pending_alter);
+	}
+	INIT_LIST_HEAD(&sma->pending_alter);
+}
+
+/**
+ * merge_queues - Merge single semop queues into global queue
+ * @sma: semaphore array
+ *
+ * This function merges all per-semaphore queues into the global queue.
+ * It is necessary to achieve FIFO ordering for the pending single-sop
+ * operations when a multi-semop operation must sleep.
+ * Only the alter operations must be moved, the const operations can stay.
+ */
+static void merge_queues(struct sem_array *sma)
+{
+	int i;
+	for (i = 0; i < sma->sem_nsems; i++) {
+		struct sem *sem = sma->sem_base + i;
+
+		list_splice_init(&sem->pending_alter, &sma->pending_alter);
+	}
+}
+
 /*
  * If the request contains only one semaphore operation, and there are
  * no complex transactions pending, lock only the semaphore involved.
@@ -262,6 +309,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 static inline void sem_unlock(struct sem_array *sma, int locknum)
 {
 	if (locknum == -1) {
+		unmerge_queues(sma);
 		ipc_unlock_object(&sma->sem_perm);
 	} else {
 		struct sem *sem = sma->sem_base + locknum;
@@ -831,49 +879,38 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
 			int otime, struct list_head *pt)
 {
 	int i;
-	int progress;
 
 	otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);
 
-	progress = 1;
-retry_global:
-	if (sma->complex_count) {
-		if (update_queue(sma, -1, pt)) {
-			progress = 1;
-			otime = 1;
-			sops = NULL;
-		}
-	}
-	if (!progress)
-		goto done;
-
-	if (!sops) {
-		/* No semops; something special is going on. */
-		for (i = 0; i < sma->sem_nsems; i++) {
-			if (update_queue(sma, i, pt)) {
-				otime = 1;
-				progress = 1;
+	if (!list_empty(&sma->pending_alter)) {
+		/* semaphore array uses the global queue - just process it. */
+		otime |= update_queue(sma, -1, pt);
+	} else {
+		if (!sops) {
+			/*
+			 * No sops, thus the modified semaphores are not
+			 * known. Check all.
+			 */
+			for (i = 0; i < sma->sem_nsems; i++)
+				otime |= update_queue(sma, i, pt);
+		} else {
+			/*
+			 * Check the semaphores that were increased:
+			 * - No complex ops, thus all sleeping ops are
+			 *   decrease.
+			 * - if we decreased the value, then any sleeping
+			 *   semaphore ops wont be able to run: If the
+			 *   previous value was too small, then the new
+			 *   value will be too small, too.
+			 */
+			for (i = 0; i < nsops; i++) {
+				if (sops[i].sem_op > 0) {
+					otime |= update_queue(sma,
+							sops[i].sem_num, pt);
+				}
 			}
 		}
-		goto done_checkretry;
-	}
-
-	/* Check the semaphores that were modified. */
-	for (i = 0; i < nsops; i++) {
-		if (sops[i].sem_op > 0 ||
-			(sops[i].sem_op < 0 &&
-				sma->sem_base[sops[i].sem_num].semval == 0))
-			if (update_queue(sma, sops[i].sem_num, pt)) {
-				otime = 1;
-				progress = 1;
-			}
-	}
-done_checkretry:
-	if (progress) {
-		progress = 0;
-		goto retry_global;
 	}
-done:
 	if (otime)
 		sma->sem_otime = get_seconds();
 }
@@ -1747,11 +1784,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 		struct sem *curr;
 		curr = &sma->sem_base[sops->sem_num];
 
-		if (alter)
-			list_add_tail(&queue.list, &curr->pending_alter);
-		else
+		if (alter) {
+			if (sma->complex_count) {
+				list_add_tail(&queue.list,
+						&sma->pending_alter);
+			} else {
+
+				list_add_tail(&queue.list,
+						&curr->pending_alter);
+			}
+		} else {
 			list_add_tail(&queue.list, &curr->pending_const);
+		}
 	} else {
+		if (!sma->complex_count)
+			merge_queues(sma);
+
 		if (alter)
 			list_add_tail(&queue.list, &sma->pending_alter);
 		else
-- 
cgit v0.10.2


From d12e1e50e47e0900dbbf52237b7e171f4f15ea1e Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Mon, 8 Jul 2013 16:01:25 -0700
Subject: ipc/sem.c: replace shared sem_otime with per-semaphore value

sem_otime contains the time of the last semaphore operation that
completed successfully.  Every operation updates this value, thus access
from multiple cpus can cause thrashing.

Therefore the patch replaces the variable with a per-semaphore variable.
The per-array sem_otime is only calculated when required.

No performance improvement on a single-socket i3 - only important for
larger systems.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/sem.h b/include/linux/sem.h
index 55e17f6..976ce3a 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -12,7 +12,6 @@ struct task_struct;
 struct sem_array {
 	struct kern_ipc_perm	____cacheline_aligned_in_smp
 				sem_perm;	/* permissions .. see ipc.h */
-	time_t			sem_otime;	/* last semop time */
 	time_t			sem_ctime;	/* last change time */
 	struct sem		*sem_base;	/* ptr to first semaphore in array */
 	struct list_head	pending_alter;	/* pending operations */
diff --git a/ipc/sem.c b/ipc/sem.c
index 6291257..51352e1 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -99,6 +99,7 @@ struct sem {
 					/* that alter the semaphore */
 	struct list_head pending_const; /* pending single-sop operations */
 					/* that do not alter the semaphore*/
+	time_t	sem_otime;	/* candidate for sem_otime */
 } ____cacheline_aligned_in_smp;
 
 /* One queue for each sleeping process in the system. */
@@ -911,8 +912,14 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
 			}
 		}
 	}
-	if (otime)
-		sma->sem_otime = get_seconds();
+	if (otime) {
+		if (sops == NULL) {
+			sma->sem_base[0].sem_otime = get_seconds();
+		} else {
+			sma->sem_base[sops[0].sem_num].sem_otime =
+								get_seconds();
+		}
+	}
 }
 
 
@@ -1058,6 +1065,21 @@ static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in,
 	}
 }
 
+static time_t get_semotime(struct sem_array *sma)
+{
+	int i;
+	time_t res;
+
+	res = sma->sem_base[0].sem_otime;
+	for (i = 1; i < sma->sem_nsems; i++) {
+		time_t to = sma->sem_base[i].sem_otime;
+
+		if (to > res)
+			res = to;
+	}
+	return res;
+}
+
 static int semctl_nolock(struct ipc_namespace *ns, int semid,
 			 int cmd, int version, void __user *p)
 {
@@ -1131,9 +1153,9 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid,
 			goto out_unlock;
 
 		kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm);
-		tbuf.sem_otime  = sma->sem_otime;
-		tbuf.sem_ctime  = sma->sem_ctime;
-		tbuf.sem_nsems  = sma->sem_nsems;
+		tbuf.sem_otime = get_semotime(sma);
+		tbuf.sem_ctime = sma->sem_ctime;
+		tbuf.sem_nsems = sma->sem_nsems;
 		rcu_read_unlock();
 		if (copy_semid_to_user(p, &tbuf, version))
 			return -EFAULT;
@@ -2025,6 +2047,9 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 {
 	struct user_namespace *user_ns = seq_user_ns(s);
 	struct sem_array *sma = it;
+	time_t sem_otime;
+
+	sem_otime = get_semotime(sma);
 
 	return seq_printf(s,
 			  "%10d %10d  %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
@@ -2036,7 +2061,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 			  from_kgid_munged(user_ns, sma->sem_perm.gid),
 			  from_kuid_munged(user_ns, sma->sem_perm.cuid),
 			  from_kgid_munged(user_ns, sma->sem_perm.cgid),
-			  sma->sem_otime,
+			  sem_otime,
 			  sma->sem_ctime);
 }
 #endif
-- 
cgit v0.10.2


From 758a6ba39ef6df4cdc615e5edd7bd86eab81a5f7 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Mon, 8 Jul 2013 16:01:26 -0700
Subject: ipc/sem.c: rename try_atomic_semop() to perform_atomic_semop(), docu
 update

Cleanup: Some minor points that I noticed while writing the previous
patches

1) The name try_atomic_semop() is misleading: The function performs the
   operation (if it is possible).

2) Some documentation updates.

No real code change, a rename and documentation changes.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/sem.c b/ipc/sem.c
index 51352e1..4108889 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -154,12 +154,15 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
 #define SEMOPM_FAST	64  /* ~ 372 bytes on stack */
 
 /*
- * linked list protection:
+ * Locking:
  *	sem_undo.id_next,
+ *	sem_array.complex_count,
  *	sem_array.pending{_alter,_cont},
- *	sem_array.sem_undo: sem_lock() for read/write
+ *	sem_array.sem_undo: global sem_lock() for read/write
  *	sem_undo.proc_next: only "current" is allowed to read/write that field.
  *	
+ *	sem_array.sem_base[i].pending_{const,alter}:
+ *		global or semaphore sem_lock() for read/write
  */
 
 #define sc_semmsl	sem_ctls[0]
@@ -536,12 +539,19 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
 	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
 }
 
-/*
- * Determine whether a sequence of semaphore operations would succeed
- * all at once. Return 0 if yes, 1 if need to sleep, else return error code.
+/** perform_atomic_semop - Perform (if possible) a semaphore operation
+ * @sma: semaphore array
+ * @sops: array with operations that should be checked
+ * @nsems: number of sops
+ * @un: undo array
+ * @pid: pid that did the change
+ *
+ * Returns 0 if the operation was possible.
+ * Returns 1 if the operation is impossible, the caller must sleep.
+ * Negative values are error codes.
  */
 
-static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops,
+static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops,
 			     int nsops, struct sem_undo *un, int pid)
 {
 	int result, sem_op;
@@ -724,8 +734,8 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
 		q = container_of(walk, struct sem_queue, list);
 		walk = walk->next;
 
-		error = try_atomic_semop(sma, q->sops, q->nsops,
-						q->undo, q->pid);
+		error = perform_atomic_semop(sma, q->sops, q->nsops,
+						 q->undo, q->pid);
 
 		if (error <= 0) {
 			/* operation completed, remove from queue & wakeup */
@@ -838,7 +848,7 @@ again:
 		if (semnum != -1 && sma->sem_base[semnum].semval == 0)
 			break;
 
-		error = try_atomic_semop(sma, q->sops, q->nsops,
+		error = perform_atomic_semop(sma, q->sops, q->nsops,
 					 q->undo, q->pid);
 
 		/* Does q->sleeper still need to sleep? */
@@ -1686,7 +1696,6 @@ static int get_queue_result(struct sem_queue *q)
 	return error;
 }
 
-
 SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 		unsigned, nsops, const struct timespec __user *, timeout)
 {
@@ -1784,7 +1793,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	if (un && un->semid == -1)
 		goto out_unlock_free;
 
-	error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
+	error = perform_atomic_semop(sma, sops, nsops, un,
+					task_tgid_vnr(current));
 	if (error <= 0) {
 		if (alter && error == 0)
 			do_smart_update(sma, sops, nsops, 1, &tasks);
-- 
cgit v0.10.2


From 026dadad6b44f0469a475efb4cae48269d8848bd Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 8 Jul 2013 16:01:27 -0700
Subject: mwave: fix info leak in mwave_ioctl()

Smatch complains that on 64 bit systems, there is a hole in the
MW_ABILITIES struct between ->component_count and ->component_list[].
It leaks stack information from the mwave_ioctl() function.

I've added a memset() to initialize the struct to zero.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Greg KH <greg@kroah.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/mwave/tp3780i.c b/drivers/char/mwave/tp3780i.c
index c689697..04e6d6a 100644
--- a/drivers/char/mwave/tp3780i.c
+++ b/drivers/char/mwave/tp3780i.c
@@ -479,6 +479,7 @@ int tp3780I_QueryAbilities(THINKPAD_BD_DATA * pBDData, MW_ABILITIES * pAbilities
 	PRINTK_2(TRACE_TP3780I,
 		"tp3780i::tp3780I_QueryAbilities entry pBDData %p\n", pBDData);
 
+	memset(pAbilities, 0, sizeof(*pAbilities));
 	/* fill out standard constant fields */
 	pAbilities->instr_per_sec = pBDData->rDspSettings.uIps;
 	pAbilities->data_size = pBDData->rDspSettings.uDStoreSize;
-- 
cgit v0.10.2


From 1d04f3c6ab6bbdc6187ba44b8a667a785b63c4f2 Mon Sep 17 00:00:00 2001
From: Philippe De Muyter <phdm@macqel.be>
Date: Mon, 8 Jul 2013 16:01:28 -0700
Subject: partitions/msdos.c: end-of-line whitespace and semicolon cleanup

Signed-off-by: Philippe De Muyter <phdm@macqel.be>
Cc: Karel Zak <kzak@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index 7681cd2..9bf19e6 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -90,7 +90,7 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
 		if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
 			ret = 1;
 		put_dev_sector(sect);
-	};
+	}
 	return ret;
 }
 
@@ -142,7 +142,7 @@ static void parse_extended(struct parsed_partitions *state,
 			return;
 
 		if (!msdos_magic_present(data + 510))
-			goto done; 
+			goto done;
 
 		p = (struct partition *) (data + 0x1be);
 
@@ -155,7 +155,7 @@ static void parse_extended(struct parsed_partitions *state,
 		 * and OS/2 seems to use all four entries.
 		 */
 
-		/* 
+		/*
 		 * First process the data partition(s)
 		 */
 		for (i=0; i<4; i++, p++) {
@@ -263,7 +263,7 @@ static void parse_solaris_x86(struct parsed_partitions *state,
 }
 
 #if defined(CONFIG_BSD_DISKLABEL)
-/* 
+/*
  * Create devices for BSD partitions listed in a disklabel, under a
  * dos-like partition. See parse_extended() for more information.
  */
@@ -294,7 +294,7 @@ static void parse_bsd(struct parsed_partitions *state,
 
 		if (state->next == state->limit)
 			break;
-		if (p->p_fstype == BSD_FS_UNUSED) 
+		if (p->p_fstype == BSD_FS_UNUSED)
 			continue;
 		bsd_start = le32_to_cpu(p->p_offset);
 		bsd_size = le32_to_cpu(p->p_size);
@@ -441,7 +441,7 @@ static struct {
 	{NEW_SOLARIS_X86_PARTITION, parse_solaris_x86},
 	{0, NULL},
 };
- 
+
 int msdos_partition(struct parsed_partitions *state)
 {
 	sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
-- 
cgit v0.10.2


From 6ceea22bbbc84fcf6bf0913bb3db8a657e9002f6 Mon Sep 17 00:00:00 2001
From: Philippe De Muyter <phdm@macqel.be>
Date: Mon, 8 Jul 2013 16:01:29 -0700
Subject: partitions: add aix lvm partition support files

Add partitions/aix.h and partitions/aix.c.

AIX LVM permits to make "logical volumes" which are made of multiple
slices of multiple disks.  The new code allows only access to the
"logical volumes" which are made of one slice on the probed disk, a
slice being a contiguous disk area.  The code also detects "logical
volumes" made of multiple slices on the probed disk, but can not
describe them to the partition layer, because the partition layer
generic code does not support that.  When such non-contiguous "logical
volumes" are detected, a diagnostic message is printed.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Philippe De Muyter <phdm@macqel.be>
Cc: Karel Zak <kzak@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
index 75a54e1..4cebb2f 100644
--- a/block/partitions/Kconfig
+++ b/block/partitions/Kconfig
@@ -68,6 +68,17 @@ config ACORN_PARTITION_RISCIX
 	  of machines called RISCiX.  If you say 'Y' here, Linux will be able
 	  to read disks partitioned under RISCiX.
 
+config AIX_PARTITION
+	bool "AIX basic partition table support" if PARTITION_ADVANCED
+	help
+	  Say Y here if you would like to be able to read the hard disk
+	  partition table format used by IBM or Motorola PowerPC machines
+	  running AIX.  AIX actually uses a Logical Volume Manager, where
+	  "logical volumes" can be spread across one or multiple disks,
+	  but this driver works only for the simple case of partitions which
+	  are contiguous.
+	  Otherwise, say N.
+
 config OSF_PARTITION
 	bool "Alpha OSF partition support" if PARTITION_ADVANCED
 	default y if ALPHA
diff --git a/block/partitions/Makefile b/block/partitions/Makefile
index 03af8ea..2be4d7b 100644
--- a/block/partitions/Makefile
+++ b/block/partitions/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_BLOCK) := check.o
 obj-$(CONFIG_ACORN_PARTITION) += acorn.o
 obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
 obj-$(CONFIG_ATARI_PARTITION) += atari.o
+obj-$(CONFIG_AIX_PARTITION) += aix.o
 obj-$(CONFIG_MAC_PARTITION) += mac.o
 obj-$(CONFIG_LDM_PARTITION) += ldm.o
 obj-$(CONFIG_MSDOS_PARTITION) += msdos.o
diff --git a/block/partitions/aix.c b/block/partitions/aix.c
new file mode 100644
index 0000000..43be471
--- /dev/null
+++ b/block/partitions/aix.c
@@ -0,0 +1,293 @@
+/*
+ *  fs/partitions/aix.c
+ *
+ *  Copyright (C) 2012-2013 Philippe De Muyter <phdm@macqel.be>
+ */
+
+#include "check.h"
+#include "aix.h"
+
+struct lvm_rec {
+	char lvm_id[4]; /* "_LVM" */
+	char reserved4[16];
+	__be32 lvmarea_len;
+	__be32 vgda_len;
+	__be32 vgda_psn[2];
+	char reserved36[10];
+	__be16 pp_size; /* log2(pp_size) */
+	char reserved46[12];
+	__be16 version;
+	};
+
+struct vgda {
+	__be32 secs;
+	__be32 usec;
+	char reserved8[16];
+	__be16 numlvs;
+	__be16 maxlvs;
+	__be16 pp_size;
+	__be16 numpvs;
+	__be16 total_vgdas;
+	__be16 vgda_size;
+	};
+
+struct lvd {
+	__be16 lv_ix;
+	__be16 res2;
+	__be16 res4;
+	__be16 maxsize;
+	__be16 lv_state;
+	__be16 mirror;
+	__be16 mirror_policy;
+	__be16 num_lps;
+	__be16 res10[8];
+	};
+
+struct lvname {
+	char name[64];
+	};
+
+struct ppe {
+	__be16 lv_ix;
+	unsigned short res2;
+	unsigned short res4;
+	__be16 lp_ix;
+	unsigned short res8[12];
+	};
+
+struct pvd {
+	char reserved0[16];
+	__be16 pp_count;
+	char reserved18[2];
+	__be32 psn_part1;
+	char reserved24[8];
+	struct ppe ppe[1016];
+	};
+
+#define LVM_MAXLVS 256
+
+/**
+ * last_lba(): return number of last logical block of device
+ * @bdev: block device
+ *
+ * Description: Returns last LBA value on success, 0 on error.
+ * This is stored (by sd and ide-geometry) in
+ *  the part[0] entry for this disk, and is the number of
+ *  physical sectors available on the disk.
+ */
+static u64 last_lba(struct block_device *bdev)
+{
+	if (!bdev || !bdev->bd_inode)
+		return 0;
+	return (bdev->bd_inode->i_size >> 9) - 1ULL;
+}
+
+/**
+ * read_lba(): Read bytes from disk, starting at given LBA
+ * @state
+ * @lba
+ * @buffer
+ * @count
+ *
+ * Description:  Reads @count bytes from @state->bdev into @buffer.
+ * Returns number of bytes read on success, 0 on error.
+ */
+static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer,
+			size_t count)
+{
+	size_t totalreadcount = 0;
+
+	if (!buffer || lba + count / 512 > last_lba(state->bdev))
+		return 0;
+
+	while (count) {
+		int copied = 512;
+		Sector sect;
+		unsigned char *data = read_part_sector(state, lba++, &sect);
+		if (!data)
+			break;
+		if (copied > count)
+			copied = count;
+		memcpy(buffer, data, copied);
+		put_dev_sector(sect);
+		buffer += copied;
+		totalreadcount += copied;
+		count -= copied;
+	}
+	return totalreadcount;
+}
+
+/**
+ * alloc_pvd(): reads physical volume descriptor
+ * @state
+ * @lba
+ *
+ * Description: Returns pvd on success,  NULL on error.
+ * Allocates space for pvd and fill it with disk blocks at @lba
+ * Notes: remember to free pvd when you're done!
+ */
+static struct pvd *alloc_pvd(struct parsed_partitions *state, u32 lba)
+{
+	size_t count = sizeof(struct pvd);
+	struct pvd *p;
+
+	p = kmalloc(count, GFP_KERNEL);
+	if (!p)
+		return NULL;
+
+	if (read_lba(state, lba, (u8 *) p, count) < count) {
+		kfree(p);
+		return NULL;
+	}
+	return p;
+}
+
+/**
+ * alloc_lvn(): reads logical volume names
+ * @state
+ * @lba
+ *
+ * Description: Returns lvn on success,  NULL on error.
+ * Allocates space for lvn and fill it with disk blocks at @lba
+ * Notes: remember to free lvn when you're done!
+ */
+static struct lvname *alloc_lvn(struct parsed_partitions *state, u32 lba)
+{
+	size_t count = sizeof(struct lvname) * LVM_MAXLVS;
+	struct lvname *p;
+
+	p = kmalloc(count, GFP_KERNEL);
+	if (!p)
+		return NULL;
+
+	if (read_lba(state, lba, (u8 *) p, count) < count) {
+		kfree(p);
+		return NULL;
+	}
+	return p;
+}
+
+int aix_partition(struct parsed_partitions *state)
+{
+	int ret = 0;
+	Sector sect;
+	unsigned char *d;
+	u32 pp_bytes_size;
+	u32 pp_blocks_size = 0;
+	u32 vgda_sector = 0;
+	u32 vgda_len = 0;
+	int numlvs = 0;
+	struct pvd *pvd;
+	struct lv_info {
+		unsigned short pps_per_lv;
+		unsigned short pps_found;
+		unsigned char lv_is_contiguous;
+	} *lvip;
+	struct lvname *n = NULL;
+
+	d = read_part_sector(state, 7, &sect);
+	if (d) {
+		struct lvm_rec *p = (struct lvm_rec *)d;
+		u16 lvm_version = be16_to_cpu(p->version);
+		char tmp[64];
+
+		if (lvm_version == 1) {
+			int pp_size_log2 = be16_to_cpu(p->pp_size);
+
+			pp_bytes_size = 1 << pp_size_log2;
+			pp_blocks_size = pp_bytes_size / 512;
+			snprintf(tmp, sizeof(tmp),
+				" AIX LVM header version %u found\n",
+				lvm_version);
+			vgda_len = be32_to_cpu(p->vgda_len);
+			vgda_sector = be32_to_cpu(p->vgda_psn[0]);
+		} else {
+			snprintf(tmp, sizeof(tmp),
+				" unsupported AIX LVM version %d found\n",
+				lvm_version);
+		}
+		strlcat(state->pp_buf, tmp, PAGE_SIZE);
+		put_dev_sector(sect);
+	}
+	if (vgda_sector && (d = read_part_sector(state, vgda_sector, &sect))) {
+		struct vgda *p = (struct vgda *)d;
+
+		numlvs = be16_to_cpu(p->numlvs);
+		put_dev_sector(sect);
+	}
+	lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL);
+	if (!lvip)
+		return 0;
+	if (numlvs && (d = read_part_sector(state, vgda_sector + 1, &sect))) {
+		struct lvd *p = (struct lvd *)d;
+		int i;
+
+		n = alloc_lvn(state, vgda_sector + vgda_len - 33);
+		if (n) {
+			int foundlvs = 0;
+
+			for (i = 0; foundlvs < numlvs && i < state->limit; i += 1) {
+				lvip[i].pps_per_lv = be16_to_cpu(p[i].num_lps);
+				if (lvip[i].pps_per_lv)
+					foundlvs += 1;
+			}
+		}
+		put_dev_sector(sect);
+	}
+	pvd = alloc_pvd(state, vgda_sector + 17);
+	if (pvd) {
+		int numpps = be16_to_cpu(pvd->pp_count);
+		int psn_part1 = be32_to_cpu(pvd->psn_part1);
+		int i;
+		int cur_lv_ix = -1;
+		int next_lp_ix = 1;
+		int lp_ix;
+
+		for (i = 0; i < numpps; i += 1) {
+			struct ppe *p = pvd->ppe + i;
+			unsigned int lv_ix;
+
+			lp_ix = be16_to_cpu(p->lp_ix);
+			if (!lp_ix) {
+				next_lp_ix = 1;
+				continue;
+			}
+			lv_ix = be16_to_cpu(p->lv_ix) - 1;
+			if (lv_ix > state->limit) {
+				cur_lv_ix = -1;
+				continue;
+			}
+			lvip[lv_ix].pps_found += 1;
+			if (lp_ix == 1) {
+				cur_lv_ix = lv_ix;
+				next_lp_ix = 1;
+			} else if (lv_ix != cur_lv_ix || lp_ix != next_lp_ix) {
+				next_lp_ix = 1;
+				continue;
+			}
+			if (lp_ix == lvip[lv_ix].pps_per_lv) {
+				char tmp[70];
+
+				put_partition(state, lv_ix + 1,
+				  (i + 1 - lp_ix) * pp_blocks_size + psn_part1,
+				  lvip[lv_ix].pps_per_lv * pp_blocks_size);
+				snprintf(tmp, sizeof(tmp), " <%s>\n",
+					 n[lv_ix].name);
+				strlcat(state->pp_buf, tmp, PAGE_SIZE);
+				lvip[lv_ix].lv_is_contiguous = 1;
+				ret = 1;
+				next_lp_ix = 1;
+			} else
+				next_lp_ix += 1;
+		}
+		for (i = 0; i < state->limit; i += 1)
+			if (lvip[i].pps_found && !lvip[i].lv_is_contiguous)
+				pr_warn("partition %s (%u pp's found) is "
+					"not contiguous\n",
+					n[i].name, lvip[i].pps_found);
+		kfree(pvd);
+	}
+	kfree(n);
+	kfree(lvip);
+	return ret;
+}
diff --git a/block/partitions/aix.h b/block/partitions/aix.h
new file mode 100644
index 0000000..e0c66a9
--- /dev/null
+++ b/block/partitions/aix.h
@@ -0,0 +1 @@
+extern int aix_partition(struct parsed_partitions *state);
-- 
cgit v0.10.2


From f8f066033b015a744065f6c7ed83741b4760376b Mon Sep 17 00:00:00 2001
From: Philippe De Muyter <phdm@macqel.be>
Date: Mon, 8 Jul 2013 16:01:30 -0700
Subject: partitions/msdos: enumerate also AIX LVM partitions

Graft AIX partitions enumeration into partitions/msdos.c

There is already a AIX disks detection logic in msdos.c.  When an AIX disk
has been found, and if configured to, call the aix partitions recognizer.
This avoids removal of AIX disks protection from msdos.c, avoids code
duplication, and ensures that AIX partitions enumeration is called before
plain msdos partitions enumeration.

Signed-off-by: Philippe De Muyter <phdm@macqel.be>
Cc: Karel Zak <kzak@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index 9bf19e6..9123f25 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -23,6 +23,7 @@
 #include "check.h"
 #include "msdos.h"
 #include "efi.h"
+#include "aix.h"
 
 /*
  * Many architectures don't like unaligned accesses, while
@@ -462,8 +463,12 @@ int msdos_partition(struct parsed_partitions *state)
 	 */
 	if (aix_magic_present(state, data)) {
 		put_dev_sector(sect);
+#ifdef CONFIG_AIX_PARTITION
+		return aix_partition(state);
+#else
 		strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
 		return 0;
+#endif
 	}
 
 	if (!msdos_magic_present(data + 510)) {
-- 
cgit v0.10.2


From 0efbee70890c992f31a7b294ac654ff6c62d51c5 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Mon, 8 Jul 2013 16:01:31 -0700
Subject: reboot: remove -stable friendly PF_THREAD_BOUND define

Remove the prior patch's #define for easier backporting to the stable
releases.

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/sys.c b/kernel/sys.c
index 071de90..b882440 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -362,11 +362,6 @@ int unregister_reboot_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
-/* Add backwards compatibility for stable trees. */
-#ifndef PF_NO_SETAFFINITY
-#define PF_NO_SETAFFINITY		PF_THREAD_BOUND
-#endif
-
 static void migrate_to_reboot_cpu(void)
 {
 	/* The boot cpu is always logical cpu 0 */
-- 
cgit v0.10.2


From 15d94b82565ebfb0cf27830b96e6cf5ed2d12a9a Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Mon, 8 Jul 2013 16:01:32 -0700
Subject: reboot: move shutdown/reboot related functions to kernel/reboot.c

This patch is preparatory.  It moves reboot related syscall, etc
functions from kernel/sys.c to kernel/reboot.c.

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/Makefile b/kernel/Makefile
index 271fd31..470839d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = fork.o exec_domain.o panic.o printk.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o cred.o \
+	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o groups.o lglock.o smpboot.o
 
 ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/reboot.c b/kernel/reboot.c
new file mode 100644
index 0000000..37d2636
--- /dev/null
+++ b/kernel/reboot.c
@@ -0,0 +1,346 @@
+/*
+ *  linux/kernel/reboot.c
+ *
+ *  Copyright (C) 2013  Linus Torvalds
+ */
+
+#include <linux/export.h>
+#include <linux/kexec.h>
+#include <linux/kmod.h>
+#include <linux/kmsg_dump.h>
+#include <linux/reboot.h>
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/syscore_ops.h>
+#include <linux/uaccess.h>
+
+/*
+ * this indicates whether you can reboot with ctrl-alt-del: the default is yes
+ */
+
+int C_A_D = 1;
+struct pid *cad_pid;
+EXPORT_SYMBOL(cad_pid);
+
+/*
+ * If set, this is used for preparing the system to power off.
+ */
+
+void (*pm_power_off_prepare)(void);
+
+/**
+ *	emergency_restart - reboot the system
+ *
+ *	Without shutting down any hardware or taking any locks
+ *	reboot the system.  This is called when we know we are in
+ *	trouble so this is our best effort to reboot.  This is
+ *	safe to call in interrupt context.
+ */
+void emergency_restart(void)
+{
+	kmsg_dump(KMSG_DUMP_EMERG);
+	machine_emergency_restart();
+}
+EXPORT_SYMBOL_GPL(emergency_restart);
+
+void kernel_restart_prepare(char *cmd)
+{
+	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
+	system_state = SYSTEM_RESTART;
+	usermodehelper_disable();
+	device_shutdown();
+}
+
+/**
+ *	register_reboot_notifier - Register function to be called at reboot time
+ *	@nb: Info about notifier function to be called
+ *
+ *	Registers a function with the list of functions
+ *	to be called at reboot time.
+ *
+ *	Currently always returns zero, as blocking_notifier_chain_register()
+ *	always returns zero.
+ */
+int register_reboot_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&reboot_notifier_list, nb);
+}
+EXPORT_SYMBOL(register_reboot_notifier);
+
+/**
+ *	unregister_reboot_notifier - Unregister previously registered reboot notifier
+ *	@nb: Hook to be unregistered
+ *
+ *	Unregisters a previously registered reboot
+ *	notifier function.
+ *
+ *	Returns zero on success, or %-ENOENT on failure.
+ */
+int unregister_reboot_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
+}
+EXPORT_SYMBOL(unregister_reboot_notifier);
+
+static void migrate_to_reboot_cpu(void)
+{
+	/* The boot cpu is always logical cpu 0 */
+	int cpu = 0;
+
+	cpu_hotplug_disable();
+
+	/* Make certain the cpu I'm about to reboot on is online */
+	if (!cpu_online(cpu))
+		cpu = cpumask_first(cpu_online_mask);
+
+	/* Prevent races with other tasks migrating this task */
+	current->flags |= PF_NO_SETAFFINITY;
+
+	/* Make certain I only run on the appropriate processor */
+	set_cpus_allowed_ptr(current, cpumask_of(cpu));
+}
+
+/**
+ *	kernel_restart - reboot the system
+ *	@cmd: pointer to buffer containing command to execute for restart
+ *		or %NULL
+ *
+ *	Shutdown everything and perform a clean reboot.
+ *	This is not safe to call in interrupt context.
+ */
+void kernel_restart(char *cmd)
+{
+	kernel_restart_prepare(cmd);
+	migrate_to_reboot_cpu();
+	syscore_shutdown();
+	if (!cmd)
+		printk(KERN_EMERG "Restarting system.\n");
+	else
+		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
+	kmsg_dump(KMSG_DUMP_RESTART);
+	machine_restart(cmd);
+}
+EXPORT_SYMBOL_GPL(kernel_restart);
+
+static void kernel_shutdown_prepare(enum system_states state)
+{
+	blocking_notifier_call_chain(&reboot_notifier_list,
+		(state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
+	system_state = state;
+	usermodehelper_disable();
+	device_shutdown();
+}
+/**
+ *	kernel_halt - halt the system
+ *
+ *	Shutdown everything and perform a clean system halt.
+ */
+void kernel_halt(void)
+{
+	kernel_shutdown_prepare(SYSTEM_HALT);
+	migrate_to_reboot_cpu();
+	syscore_shutdown();
+	printk(KERN_EMERG "System halted.\n");
+	kmsg_dump(KMSG_DUMP_HALT);
+	machine_halt();
+}
+
+EXPORT_SYMBOL_GPL(kernel_halt);
+
+/**
+ *	kernel_power_off - power_off the system
+ *
+ *	Shutdown everything and perform a clean system power_off.
+ */
+void kernel_power_off(void)
+{
+	kernel_shutdown_prepare(SYSTEM_POWER_OFF);
+	if (pm_power_off_prepare)
+		pm_power_off_prepare();
+	migrate_to_reboot_cpu();
+	syscore_shutdown();
+	printk(KERN_EMERG "Power down.\n");
+	kmsg_dump(KMSG_DUMP_POWEROFF);
+	machine_power_off();
+}
+EXPORT_SYMBOL_GPL(kernel_power_off);
+
+static DEFINE_MUTEX(reboot_mutex);
+
+/*
+ * Reboot system call: for obvious reasons only root may call it,
+ * and even root needs to set up some magic numbers in the registers
+ * so that some mistake won't make this reboot the whole machine.
+ * You can also set the meaning of the ctrl-alt-del-key here.
+ *
+ * reboot doesn't sync: do that yourself before calling this.
+ */
+SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
+		void __user *, arg)
+{
+	struct pid_namespace *pid_ns = task_active_pid_ns(current);
+	char buffer[256];
+	int ret = 0;
+
+	/* We only trust the superuser with rebooting the system. */
+	if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
+		return -EPERM;
+
+	/* For safety, we require "magic" arguments. */
+	if (magic1 != LINUX_REBOOT_MAGIC1 ||
+	    (magic2 != LINUX_REBOOT_MAGIC2 &&
+	                magic2 != LINUX_REBOOT_MAGIC2A &&
+			magic2 != LINUX_REBOOT_MAGIC2B &&
+	                magic2 != LINUX_REBOOT_MAGIC2C))
+		return -EINVAL;
+
+	/*
+	 * If pid namespaces are enabled and the current task is in a child
+	 * pid_namespace, the command is handled by reboot_pid_ns() which will
+	 * call do_exit().
+	 */
+	ret = reboot_pid_ns(pid_ns, cmd);
+	if (ret)
+		return ret;
+
+	/* Instead of trying to make the power_off code look like
+	 * halt when pm_power_off is not set do it the easy way.
+	 */
+	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
+		cmd = LINUX_REBOOT_CMD_HALT;
+
+	mutex_lock(&reboot_mutex);
+	switch (cmd) {
+	case LINUX_REBOOT_CMD_RESTART:
+		kernel_restart(NULL);
+		break;
+
+	case LINUX_REBOOT_CMD_CAD_ON:
+		C_A_D = 1;
+		break;
+
+	case LINUX_REBOOT_CMD_CAD_OFF:
+		C_A_D = 0;
+		break;
+
+	case LINUX_REBOOT_CMD_HALT:
+		kernel_halt();
+		do_exit(0);
+		panic("cannot halt");
+
+	case LINUX_REBOOT_CMD_POWER_OFF:
+		kernel_power_off();
+		do_exit(0);
+		break;
+
+	case LINUX_REBOOT_CMD_RESTART2:
+		if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
+			ret = -EFAULT;
+			break;
+		}
+		buffer[sizeof(buffer) - 1] = '\0';
+
+		kernel_restart(buffer);
+		break;
+
+#ifdef CONFIG_KEXEC
+	case LINUX_REBOOT_CMD_KEXEC:
+		ret = kernel_kexec();
+		break;
+#endif
+
+#ifdef CONFIG_HIBERNATION
+	case LINUX_REBOOT_CMD_SW_SUSPEND:
+		ret = hibernate();
+		break;
+#endif
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	mutex_unlock(&reboot_mutex);
+	return ret;
+}
+
+static void deferred_cad(struct work_struct *dummy)
+{
+	kernel_restart(NULL);
+}
+
+/*
+ * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
+ * As it's called within an interrupt, it may NOT sync: the only choice
+ * is whether to reboot at once, or just ignore the ctrl-alt-del.
+ */
+void ctrl_alt_del(void)
+{
+	static DECLARE_WORK(cad_work, deferred_cad);
+
+	if (C_A_D)
+		schedule_work(&cad_work);
+	else
+		kill_cad_pid(SIGINT, 1);
+}
+
+char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+
+static int __orderly_poweroff(bool force)
+{
+	char **argv;
+	static char *envp[] = {
+		"HOME=/",
+		"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+		NULL
+	};
+	int ret;
+
+	argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
+	if (argv) {
+		ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+		argv_free(argv);
+	} else {
+		printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
+					 __func__, poweroff_cmd);
+		ret = -ENOMEM;
+	}
+
+	if (ret && force) {
+		printk(KERN_WARNING "Failed to start orderly shutdown: "
+					"forcing the issue\n");
+		/*
+		 * I guess this should try to kick off some daemon to sync and
+		 * poweroff asap.  Or not even bother syncing if we're doing an
+		 * emergency shutdown?
+		 */
+		emergency_sync();
+		kernel_power_off();
+	}
+
+	return ret;
+}
+
+static bool poweroff_force;
+
+static void poweroff_work_func(struct work_struct *work)
+{
+	__orderly_poweroff(poweroff_force);
+}
+
+static DECLARE_WORK(poweroff_work, poweroff_work_func);
+
+/**
+ * orderly_poweroff - Trigger an orderly system poweroff
+ * @force: force poweroff if command execution fails
+ *
+ * This may be called from any context to trigger a system shutdown.
+ * If the orderly shutdown fails, it will force an immediate shutdown.
+ */
+int orderly_poweroff(bool force)
+{
+	if (force) /* do not override the pending "true" */
+		poweroff_force = true;
+	schedule_work(&poweroff_work);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(orderly_poweroff);
diff --git a/kernel/sys.c b/kernel/sys.c
index b882440..771129b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -116,20 +116,6 @@ EXPORT_SYMBOL(fs_overflowuid);
 EXPORT_SYMBOL(fs_overflowgid);
 
 /*
- * this indicates whether you can reboot with ctrl-alt-del: the default is yes
- */
-
-int C_A_D = 1;
-struct pid *cad_pid;
-EXPORT_SYMBOL(cad_pid);
-
-/*
- * If set, this is used for preparing the system to power off.
- */
-
-void (*pm_power_off_prepare)(void);
-
-/*
  * Returns true if current's euid is same as p's uid or euid,
  * or has CAP_SYS_NICE to p's user_ns.
  *
@@ -308,261 +294,6 @@ out_unlock:
 	return retval;
 }
 
-/**
- *	emergency_restart - reboot the system
- *
- *	Without shutting down any hardware or taking any locks
- *	reboot the system.  This is called when we know we are in
- *	trouble so this is our best effort to reboot.  This is
- *	safe to call in interrupt context.
- */
-void emergency_restart(void)
-{
-	kmsg_dump(KMSG_DUMP_EMERG);
-	machine_emergency_restart();
-}
-EXPORT_SYMBOL_GPL(emergency_restart);
-
-void kernel_restart_prepare(char *cmd)
-{
-	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
-	system_state = SYSTEM_RESTART;
-	usermodehelper_disable();
-	device_shutdown();
-}
-
-/**
- *	register_reboot_notifier - Register function to be called at reboot time
- *	@nb: Info about notifier function to be called
- *
- *	Registers a function with the list of functions
- *	to be called at reboot time.
- *
- *	Currently always returns zero, as blocking_notifier_chain_register()
- *	always returns zero.
- */
-int register_reboot_notifier(struct notifier_block *nb)
-{
-	return blocking_notifier_chain_register(&reboot_notifier_list, nb);
-}
-EXPORT_SYMBOL(register_reboot_notifier);
-
-/**
- *	unregister_reboot_notifier - Unregister previously registered reboot notifier
- *	@nb: Hook to be unregistered
- *
- *	Unregisters a previously registered reboot
- *	notifier function.
- *
- *	Returns zero on success, or %-ENOENT on failure.
- */
-int unregister_reboot_notifier(struct notifier_block *nb)
-{
-	return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
-}
-EXPORT_SYMBOL(unregister_reboot_notifier);
-
-static void migrate_to_reboot_cpu(void)
-{
-	/* The boot cpu is always logical cpu 0 */
-	int cpu = 0;
-
-	cpu_hotplug_disable();
-
-	/* Make certain the cpu I'm about to reboot on is online */
-	if (!cpu_online(cpu))
-		cpu = cpumask_first(cpu_online_mask);
-
-	/* Prevent races with other tasks migrating this task */
-	current->flags |= PF_NO_SETAFFINITY;
-
-	/* Make certain I only run on the appropriate processor */
-	set_cpus_allowed_ptr(current, cpumask_of(cpu));
-}
-
-/**
- *	kernel_restart - reboot the system
- *	@cmd: pointer to buffer containing command to execute for restart
- *		or %NULL
- *
- *	Shutdown everything and perform a clean reboot.
- *	This is not safe to call in interrupt context.
- */
-void kernel_restart(char *cmd)
-{
-	kernel_restart_prepare(cmd);
-	migrate_to_reboot_cpu();
-	syscore_shutdown();
-	if (!cmd)
-		printk(KERN_EMERG "Restarting system.\n");
-	else
-		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
-	kmsg_dump(KMSG_DUMP_RESTART);
-	machine_restart(cmd);
-}
-EXPORT_SYMBOL_GPL(kernel_restart);
-
-static void kernel_shutdown_prepare(enum system_states state)
-{
-	blocking_notifier_call_chain(&reboot_notifier_list,
-		(state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
-	system_state = state;
-	usermodehelper_disable();
-	device_shutdown();
-}
-/**
- *	kernel_halt - halt the system
- *
- *	Shutdown everything and perform a clean system halt.
- */
-void kernel_halt(void)
-{
-	kernel_shutdown_prepare(SYSTEM_HALT);
-	migrate_to_reboot_cpu();
-	syscore_shutdown();
-	printk(KERN_EMERG "System halted.\n");
-	kmsg_dump(KMSG_DUMP_HALT);
-	machine_halt();
-}
-
-EXPORT_SYMBOL_GPL(kernel_halt);
-
-/**
- *	kernel_power_off - power_off the system
- *
- *	Shutdown everything and perform a clean system power_off.
- */
-void kernel_power_off(void)
-{
-	kernel_shutdown_prepare(SYSTEM_POWER_OFF);
-	if (pm_power_off_prepare)
-		pm_power_off_prepare();
-	migrate_to_reboot_cpu();
-	syscore_shutdown();
-	printk(KERN_EMERG "Power down.\n");
-	kmsg_dump(KMSG_DUMP_POWEROFF);
-	machine_power_off();
-}
-EXPORT_SYMBOL_GPL(kernel_power_off);
-
-static DEFINE_MUTEX(reboot_mutex);
-
-/*
- * Reboot system call: for obvious reasons only root may call it,
- * and even root needs to set up some magic numbers in the registers
- * so that some mistake won't make this reboot the whole machine.
- * You can also set the meaning of the ctrl-alt-del-key here.
- *
- * reboot doesn't sync: do that yourself before calling this.
- */
-SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
-		void __user *, arg)
-{
-	struct pid_namespace *pid_ns = task_active_pid_ns(current);
-	char buffer[256];
-	int ret = 0;
-
-	/* We only trust the superuser with rebooting the system. */
-	if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
-		return -EPERM;
-
-	/* For safety, we require "magic" arguments. */
-	if (magic1 != LINUX_REBOOT_MAGIC1 ||
-	    (magic2 != LINUX_REBOOT_MAGIC2 &&
-	                magic2 != LINUX_REBOOT_MAGIC2A &&
-			magic2 != LINUX_REBOOT_MAGIC2B &&
-	                magic2 != LINUX_REBOOT_MAGIC2C))
-		return -EINVAL;
-
-	/*
-	 * If pid namespaces are enabled and the current task is in a child
-	 * pid_namespace, the command is handled by reboot_pid_ns() which will
-	 * call do_exit().
-	 */
-	ret = reboot_pid_ns(pid_ns, cmd);
-	if (ret)
-		return ret;
-
-	/* Instead of trying to make the power_off code look like
-	 * halt when pm_power_off is not set do it the easy way.
-	 */
-	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
-		cmd = LINUX_REBOOT_CMD_HALT;
-
-	mutex_lock(&reboot_mutex);
-	switch (cmd) {
-	case LINUX_REBOOT_CMD_RESTART:
-		kernel_restart(NULL);
-		break;
-
-	case LINUX_REBOOT_CMD_CAD_ON:
-		C_A_D = 1;
-		break;
-
-	case LINUX_REBOOT_CMD_CAD_OFF:
-		C_A_D = 0;
-		break;
-
-	case LINUX_REBOOT_CMD_HALT:
-		kernel_halt();
-		do_exit(0);
-		panic("cannot halt.\n");
-
-	case LINUX_REBOOT_CMD_POWER_OFF:
-		kernel_power_off();
-		do_exit(0);
-		break;
-
-	case LINUX_REBOOT_CMD_RESTART2:
-		if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
-			ret = -EFAULT;
-			break;
-		}
-		buffer[sizeof(buffer) - 1] = '\0';
-
-		kernel_restart(buffer);
-		break;
-
-#ifdef CONFIG_KEXEC
-	case LINUX_REBOOT_CMD_KEXEC:
-		ret = kernel_kexec();
-		break;
-#endif
-
-#ifdef CONFIG_HIBERNATION
-	case LINUX_REBOOT_CMD_SW_SUSPEND:
-		ret = hibernate();
-		break;
-#endif
-
-	default:
-		ret = -EINVAL;
-		break;
-	}
-	mutex_unlock(&reboot_mutex);
-	return ret;
-}
-
-static void deferred_cad(struct work_struct *dummy)
-{
-	kernel_restart(NULL);
-}
-
-/*
- * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
- * As it's called within an interrupt, it may NOT sync: the only choice
- * is whether to reboot at once, or just ignore the ctrl-alt-del.
- */
-void ctrl_alt_del(void)
-{
-	static DECLARE_WORK(cad_work, deferred_cad);
-
-	if (C_A_D)
-		schedule_work(&cad_work);
-	else
-		kill_cad_pid(SIGINT, 1);
-}
-	
 /*
  * Unprivileged users may change the real gid to the effective gid
  * or vice versa.  (BSD-style)
@@ -2287,68 +2018,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
 	return err ? -EFAULT : 0;
 }
 
-char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
-
-static int __orderly_poweroff(bool force)
-{
-	char **argv;
-	static char *envp[] = {
-		"HOME=/",
-		"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
-		NULL
-	};
-	int ret;
-
-	argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
-	if (argv) {
-		ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-		argv_free(argv);
-	} else {
-		printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
-					 __func__, poweroff_cmd);
-		ret = -ENOMEM;
-	}
-
-	if (ret && force) {
-		printk(KERN_WARNING "Failed to start orderly shutdown: "
-					"forcing the issue\n");
-		/*
-		 * I guess this should try to kick off some daemon to sync and
-		 * poweroff asap.  Or not even bother syncing if we're doing an
-		 * emergency shutdown?
-		 */
-		emergency_sync();
-		kernel_power_off();
-	}
-
-	return ret;
-}
-
-static bool poweroff_force;
-
-static void poweroff_work_func(struct work_struct *work)
-{
-	__orderly_poweroff(poweroff_force);
-}
-
-static DECLARE_WORK(poweroff_work, poweroff_work_func);
-
-/**
- * orderly_poweroff - Trigger an orderly system poweroff
- * @force: force poweroff if command execution fails
- *
- * This may be called from any context to trigger a system shutdown.
- * If the orderly shutdown fails, it will force an immediate shutdown.
- */
-int orderly_poweroff(bool force)
-{
-	if (force) /* do not override the pending "true" */
-		poweroff_force = true;
-	schedule_work(&poweroff_work);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(orderly_poweroff);
-
 /**
  * do_sysinfo - fill in sysinfo struct
  * @info: pointer to buffer to fill
-- 
cgit v0.10.2


From 972ee83df88a7fd84c228a31b4f9611299898984 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Mon, 8 Jul 2013 16:01:34 -0700
Subject: reboot: checkpatch.pl the new kernel/reboot.c file

Get the new file to pass scripts/checkpatch.pl

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 23b3630..c6eba21 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -26,7 +26,7 @@ extern void machine_shutdown(void);
 struct pt_regs;
 extern void machine_crash_shutdown(struct pt_regs *);
 
-/* 
+/*
  * Architecture independent implemenations of sys_reboot commands.
  */
 
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 37d2636..abb6a04 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -4,6 +4,8 @@
  *  Copyright (C) 2013  Linus Torvalds
  */
 
+#define pr_fmt(fmt)	"reboot: " fmt
+
 #include <linux/export.h>
 #include <linux/kexec.h>
 #include <linux/kmod.h>
@@ -114,9 +116,9 @@ void kernel_restart(char *cmd)
 	migrate_to_reboot_cpu();
 	syscore_shutdown();
 	if (!cmd)
-		printk(KERN_EMERG "Restarting system.\n");
+		pr_emerg("Restarting system\n");
 	else
-		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
+		pr_emerg("Restarting system with command '%s'\n", cmd);
 	kmsg_dump(KMSG_DUMP_RESTART);
 	machine_restart(cmd);
 }
@@ -125,7 +127,7 @@ EXPORT_SYMBOL_GPL(kernel_restart);
 static void kernel_shutdown_prepare(enum system_states state)
 {
 	blocking_notifier_call_chain(&reboot_notifier_list,
-		(state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
+		(state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
 	system_state = state;
 	usermodehelper_disable();
 	device_shutdown();
@@ -140,11 +142,10 @@ void kernel_halt(void)
 	kernel_shutdown_prepare(SYSTEM_HALT);
 	migrate_to_reboot_cpu();
 	syscore_shutdown();
-	printk(KERN_EMERG "System halted.\n");
+	pr_emerg("System halted\n");
 	kmsg_dump(KMSG_DUMP_HALT);
 	machine_halt();
 }
-
 EXPORT_SYMBOL_GPL(kernel_halt);
 
 /**
@@ -159,7 +160,7 @@ void kernel_power_off(void)
 		pm_power_off_prepare();
 	migrate_to_reboot_cpu();
 	syscore_shutdown();
-	printk(KERN_EMERG "Power down.\n");
+	pr_emerg("Power down\n");
 	kmsg_dump(KMSG_DUMP_POWEROFF);
 	machine_power_off();
 }
@@ -188,10 +189,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 
 	/* For safety, we require "magic" arguments. */
 	if (magic1 != LINUX_REBOOT_MAGIC1 ||
-	    (magic2 != LINUX_REBOOT_MAGIC2 &&
-	                magic2 != LINUX_REBOOT_MAGIC2A &&
+			(magic2 != LINUX_REBOOT_MAGIC2 &&
+			magic2 != LINUX_REBOOT_MAGIC2A &&
 			magic2 != LINUX_REBOOT_MAGIC2B &&
-	                magic2 != LINUX_REBOOT_MAGIC2C))
+			magic2 != LINUX_REBOOT_MAGIC2C))
 		return -EINVAL;
 
 	/*
@@ -234,7 +235,8 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		break;
 
 	case LINUX_REBOOT_CMD_RESTART2:
-		if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
+		ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1);
+		if (ret < 0) {
 			ret = -EFAULT;
 			break;
 		}
@@ -300,14 +302,11 @@ static int __orderly_poweroff(bool force)
 		ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
 		argv_free(argv);
 	} else {
-		printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
-					 __func__, poweroff_cmd);
 		ret = -ENOMEM;
 	}
 
 	if (ret && force) {
-		printk(KERN_WARNING "Failed to start orderly shutdown: "
-					"forcing the issue\n");
+		pr_warn("Failed to start orderly shutdown: forcing the issue\n");
 		/*
 		 * I guess this should try to kick off some daemon to sync and
 		 * poweroff asap.  Or not even bother syncing if we're doing an
-- 
cgit v0.10.2


From edf2b1394611fef7806d4af72179dc3ac101f275 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Mon, 8 Jul 2013 16:01:35 -0700
Subject: reboot: x86: prepare reboot_mode for moving to generic kernel code

Prepare for the moving the parsing of reboot= to the generic kernel code
by making reboot_mode into a more generic form.

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Miguel Boton <mboton.lkml@gmail.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 76fa1e9..f770340 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,7 +36,7 @@ void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
 
 static const struct desc_ptr no_idt = {};
-static int reboot_mode;
+static enum reboot_mode reboot_mode;
 enum reboot_type reboot_type = BOOT_ACPI;
 int reboot_force;
 
@@ -88,11 +88,11 @@ static int __init reboot_setup(char *str)
 
 		switch (*str) {
 		case 'w':
-			reboot_mode = 0x1234;
+			reboot_mode = REBOOT_WARM;
 			break;
 
 		case 'c':
-			reboot_mode = 0;
+			reboot_mode = REBOOT_COLD;
 			break;
 
 #ifdef CONFIG_SMP
@@ -536,6 +536,7 @@ static void native_machine_emergency_restart(void)
 	int i;
 	int attempt = 0;
 	int orig_reboot_type = reboot_type;
+	unsigned short mode;
 
 	if (reboot_emergency)
 		emergency_vmx_disable_all();
@@ -543,7 +544,8 @@ static void native_machine_emergency_restart(void)
 	tboot_shutdown(TB_SHUTDOWN_REBOOT);
 
 	/* Tell the BIOS if we want cold or warm reboot */
-	*((unsigned short *)__va(0x472)) = reboot_mode;
+	mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0;
+	*((unsigned short *)__va(0x472)) = mode;
 
 	for (;;) {
 		/* Could also try the reset bit in the Hammer NB */
@@ -585,7 +587,7 @@ static void native_machine_emergency_restart(void)
 
 		case BOOT_EFI:
 			if (efi_enabled(EFI_RUNTIME_SERVICES))
-				efi.reset_system(reboot_mode ?
+				efi.reset_system(reboot_mode == REBOOT_WARM ?
 						 EFI_RESET_WARM :
 						 EFI_RESET_COLD,
 						 EFI_SUCCESS, 0, NULL);
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index c6eba21..37d56c3 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -10,6 +10,11 @@
 #define SYS_HALT	0x0002	/* Notify of system halt */
 #define SYS_POWER_OFF	0x0003	/* Notify of system power off */
 
+enum reboot_mode {
+	REBOOT_COLD = 0,
+	REBOOT_WARM,
+};
+
 extern int register_reboot_notifier(struct notifier_block *);
 extern int unregister_reboot_notifier(struct notifier_block *);
 
-- 
cgit v0.10.2


From c97a7008517abb7c805fbdd49410032a652def26 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Mon, 8 Jul 2013 16:01:36 -0700
Subject: reboot: unicore32: prepare reboot_mode for moving to generic kernel
 code

Prepare for the moving the parsing of reboot= to the generic kernel code
by making reboot_mode into a more generic form.

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: H. Peter Anvin <hpa@zytor.com>
Acked-by: Guan Xuetao <gxt@mprc.pku.edu.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index c944769..93dd035 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -51,14 +51,14 @@ void arch_cpu_idle(void)
 	local_irq_enable();
 }
 
-static char reboot_mode = 'h';
+static enum reboot_mode reboot_mode = REBOOT_HARD;
 
 int __init reboot_setup(char *str)
 {
-	reboot_mode = str[0];
+	if ('s' == str[0])
+		reboot_mode = REBOOT_SOFT;
 	return 1;
 }
-
 __setup("reboot=", reboot_setup);
 
 void machine_halt(void)
@@ -88,7 +88,7 @@ void machine_restart(char *cmd)
 	 * we may need it to insert some 1:1 mappings so that
 	 * soft boot works.
 	 */
-	setup_mm_for_reboot(reboot_mode);
+	setup_mm_for_reboot();
 
 	/* Clean and invalidate caches */
 	flush_cache_all();
@@ -102,7 +102,7 @@ void machine_restart(char *cmd)
 	/*
 	 * Now handle reboot code.
 	 */
-	if (reboot_mode == 's') {
+	if (reboot_mode == REBOOT_SOFT) {
 		/* Jump into ROM at address 0xffff0000 */
 		cpu_reset(VECTORS_BASE);
 	} else {
diff --git a/arch/unicore32/kernel/setup.h b/arch/unicore32/kernel/setup.h
index 30f749d..f5c51b8 100644
--- a/arch/unicore32/kernel/setup.h
+++ b/arch/unicore32/kernel/setup.h
@@ -22,7 +22,7 @@ extern void puv3_ps2_init(void);
 extern void pci_puv3_preinit(void);
 extern void __init puv3_init_gpio(void);
 
-extern void setup_mm_for_reboot(char mode);
+extern void setup_mm_for_reboot(void);
 
 extern char __stubs_start[], __stubs_end[];
 extern char __vectors_start[], __vectors_end[];
diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c
index 43c20b4..4f5a532 100644
--- a/arch/unicore32/mm/mmu.c
+++ b/arch/unicore32/mm/mmu.c
@@ -445,7 +445,7 @@ void __init paging_init(void)
  * the user-mode pages.  This will then ensure that we have predictable
  * results when turning the mmu off
  */
-void setup_mm_for_reboot(char mode)
+void setup_mm_for_reboot(void)
 {
 	unsigned long base_pmdval;
 	pgd_t *pgd;
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 37d56c3..ca29a6f 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -13,6 +13,8 @@
 enum reboot_mode {
 	REBOOT_COLD = 0,
 	REBOOT_WARM,
+	REBOOT_HARD,
+	REBOOT_SOFT,
 };
 
 extern int register_reboot_notifier(struct notifier_block *);
-- 
cgit v0.10.2


From 58591942789abe1ea18e3fb1e8d8502c70060c29 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Mon, 8 Jul 2013 16:01:38 -0700
Subject: reboot: arm: remove unused restart_mode fields from some arm subarchs

These restart_mode fields are not used at all.  Remove them to make
moving the reboot= cmdline options to the general kernel easier.

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c
index 8a53f34..41d2d90 100644
--- a/arch/arm/mach-ebsa110/core.c
+++ b/arch/arm/mach-ebsa110/core.c
@@ -321,7 +321,6 @@ MACHINE_START(EBSA110, "EBSA110")
 	.atag_offset	= 0x400,
 	.reserve_lp0	= 1,
 	.reserve_lp2	= 1,
-	.restart_mode	= 's',
 	.map_io		= ebsa110_map_io,
 	.init_early	= ebsa110_init_early,
 	.init_irq	= ebsa110_init_irq,
diff --git a/arch/arm/mach-pxa/mioa701.c b/arch/arm/mach-pxa/mioa701.c
index 654b0ac..e6b0a93 100644
--- a/arch/arm/mach-pxa/mioa701.c
+++ b/arch/arm/mach-pxa/mioa701.c
@@ -761,7 +761,6 @@ static void mioa701_machine_exit(void)
 
 MACHINE_START(MIOA701, "MIO A701")
 	.atag_offset	= 0x100,
-	.restart_mode	= 's',
 	.map_io		= &pxa27x_map_io,
 	.nr_irqs	= PXA_NR_IRQS,
 	.init_irq	= &pxa27x_init_irq,
diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c
index 362726c..c3c0042 100644
--- a/arch/arm/mach-pxa/spitz.c
+++ b/arch/arm/mach-pxa/spitz.c
@@ -979,7 +979,6 @@ static void __init spitz_fixup(struct tag *tags, char **cmdline,
 
 #ifdef CONFIG_MACH_SPITZ
 MACHINE_START(SPITZ, "SHARP Spitz")
-	.restart_mode	= 'g',
 	.fixup		= spitz_fixup,
 	.map_io		= pxa27x_map_io,
 	.nr_irqs	= PXA_NR_IRQS,
@@ -993,7 +992,6 @@ MACHINE_END
 
 #ifdef CONFIG_MACH_BORZOI
 MACHINE_START(BORZOI, "SHARP Borzoi")
-	.restart_mode	= 'g',
 	.fixup		= spitz_fixup,
 	.map_io		= pxa27x_map_io,
 	.nr_irqs	= PXA_NR_IRQS,
@@ -1007,7 +1005,6 @@ MACHINE_END
 
 #ifdef CONFIG_MACH_AKITA
 MACHINE_START(AKITA, "SHARP Akita")
-	.restart_mode	= 'g',
 	.fixup		= spitz_fixup,
 	.map_io		= pxa27x_map_io,
 	.nr_irqs	= PXA_NR_IRQS,
diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c
index 3d91d2e..a41992f 100644
--- a/arch/arm/mach-pxa/tosa.c
+++ b/arch/arm/mach-pxa/tosa.c
@@ -969,7 +969,6 @@ static void __init fixup_tosa(struct tag *tags, char **cmdline,
 }
 
 MACHINE_START(TOSA, "SHARP Tosa")
-	.restart_mode	= 'g',
 	.fixup          = fixup_tosa,
 	.map_io         = pxa25x_map_io,
 	.nr_irqs	= TOSA_NR_IRQS,
-- 
cgit v0.10.2


From 16d6d5b00ee75307bab7e4ede9452c97b28f30e2 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Mon, 8 Jul 2013 16:01:39 -0700
Subject: reboot: arm: prepare reboot_mode for moving to generic kernel code

Prepare for the moving the parsing of reboot= to the generic kernel code
by making reboot_mode into a more generic form.

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h
index 75bf079..fdf62b4 100644
--- a/arch/arm/include/asm/mach/arch.h
+++ b/arch/arm/include/asm/mach/arch.h
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 
 #ifndef __ASSEMBLY__
+#include <linux/reboot.h>
 
 struct tag;
 struct meminfo;
@@ -43,7 +44,7 @@ struct machine_desc {
 	unsigned char		reserve_lp0 :1;	/* never has lp0	*/
 	unsigned char		reserve_lp1 :1;	/* never has lp1	*/
 	unsigned char		reserve_lp2 :1;	/* never has lp2	*/
-	char			restart_mode;	/* default restart mode	*/
+	enum reboot_mode	reboot_mode;	/* default restart mode	*/
 	struct smp_operations	*smp;		/* SMP operations	*/
 	bool			(*smp_init)(void);
 	void			(*fixup)(struct tag *, char **,
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 7f1efcd..2d54406 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -175,14 +175,14 @@ void arch_cpu_idle(void)
 		default_idle();
 }
 
-static char reboot_mode = 'h';
+enum reboot_mode reboot_mode = REBOOT_HARD;
 
-int __init reboot_setup(char *str)
+static int __init reboot_setup(char *str)
 {
-	reboot_mode = str[0];
+	if ('s' == str[0])
+		reboot_mode = REBOOT_SOFT;
 	return 1;
 }
-
 __setup("reboot=", reboot_setup);
 
 /*
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 9b65327..63af9a7 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -74,7 +74,7 @@ __setup("fpe=", fpe_setup);
 
 extern void paging_init(struct machine_desc *desc);
 extern void sanity_check_meminfo(void);
-extern void reboot_setup(char *str);
+extern enum reboot_mode reboot_mode;
 extern void setup_dma_zone(struct machine_desc *desc);
 
 unsigned int processor_id;
@@ -861,8 +861,8 @@ void __init setup_arch(char **cmdline_p)
 
 	setup_dma_zone(mdesc);
 
-	if (mdesc->restart_mode)
-		reboot_setup(&mdesc->restart_mode);
+	if (mdesc->reboot_mode != REBOOT_HARD)
+		reboot_mode = mdesc->reboot_mode;
 
 	init_mm.start_code = (unsigned long) _text;
 	init_mm.end_code   = (unsigned long) _etext;
diff --git a/arch/arm/mach-footbridge/cats-hw.c b/arch/arm/mach-footbridge/cats-hw.c
index 6987a09..9669cc0 100644
--- a/arch/arm/mach-footbridge/cats-hw.c
+++ b/arch/arm/mach-footbridge/cats-hw.c
@@ -86,7 +86,7 @@ fixup_cats(struct tag *tags, char **cmdline, struct meminfo *mi)
 MACHINE_START(CATS, "Chalice-CATS")
 	/* Maintainer: Philip Blundell */
 	.atag_offset	= 0x100,
-	.restart_mode	= 's',
+	.reboot_mode	= REBOOT_SOFT,
 	.fixup		= fixup_cats,
 	.map_io		= footbridge_map_io,
 	.init_irq	= footbridge_init_irq,
-- 
cgit v0.10.2


From 7b6d864b48d95e6ea1df7df64475b9cb9616dcf9 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Mon, 8 Jul 2013 16:01:40 -0700
Subject: reboot: arm: change reboot_mode to use enum reboot_mode

Preparing to move the parsing of reboot= to generic kernel code forces
the change in reboot_mode handling to use the enum.

[akpm@linux-foundation.org: fix arch/arm/mach-socfpga/socfpga.c]
Signed-off-by: Robin Holt <holt@sgi.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/arm/include/asm/hardware/iop3xx.h b/arch/arm/include/asm/hardware/iop3xx.h
index ed94b1a..423744b 100644
--- a/arch/arm/include/asm/hardware/iop3xx.h
+++ b/arch/arm/include/asm/hardware/iop3xx.h
@@ -223,11 +223,12 @@ extern int iop3xx_get_init_atu(void);
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
+#include <linux/reboot.h>
 
 void iop3xx_map_io(void);
 void iop_init_cp6_handler(void);
 void iop_init_time(unsigned long tickrate);
-void iop3xx_restart(char, const char *);
+void iop3xx_restart(enum reboot_mode, const char *);
 
 static inline u32 read_tmr0(void)
 {
diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h
index fdf62b4..441efc4 100644
--- a/arch/arm/include/asm/mach/arch.h
+++ b/arch/arm/include/asm/mach/arch.h
@@ -59,7 +59,7 @@ struct machine_desc {
 #ifdef CONFIG_MULTI_IRQ_HANDLER
 	void			(*handle_irq)(struct pt_regs *);
 #endif
-	void			(*restart)(char, const char *);
+	void			(*restart)(enum reboot_mode, const char *);
 };
 
 /*
diff --git a/arch/arm/include/asm/system_misc.h b/arch/arm/include/asm/system_misc.h
index 21a23e3..a3d61ad 100644
--- a/arch/arm/include/asm/system_misc.h
+++ b/arch/arm/include/asm/system_misc.h
@@ -6,11 +6,12 @@
 #include <linux/compiler.h>
 #include <linux/linkage.h>
 #include <linux/irqflags.h>
+#include <linux/reboot.h>
 
 extern void cpu_init(void);
 
 void soft_restart(unsigned long);
-extern void (*arm_pm_restart)(char str, const char *cmd);
+extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);
 extern void (*arm_pm_idle)(void);
 
 #define UDBG_UNDEFINED	(1 << 0)
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 2d54406..b7fdd86 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -32,6 +32,7 @@
 #include <linux/hw_breakpoint.h>
 #include <linux/cpuidle.h>
 #include <linux/leds.h>
+#include <linux/reboot.h>
 
 #include <asm/cacheflush.h>
 #include <asm/idmap.h>
@@ -113,7 +114,7 @@ void soft_restart(unsigned long addr)
 	BUG();
 }
 
-static void null_restart(char mode, const char *cmd)
+static void null_restart(enum reboot_mode reboot_mode, const char *cmd)
 {
 }
 
@@ -123,7 +124,7 @@ static void null_restart(char mode, const char *cmd)
 void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
 
-void (*arm_pm_restart)(char str, const char *cmd) = null_restart;
+void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd) = null_restart;
 EXPORT_SYMBOL_GPL(arm_pm_restart);
 
 /*
diff --git a/arch/arm/mach-at91/at91rm9200.c b/arch/arm/mach-at91/at91rm9200.c
index 9eb5743..4aad93d 100644
--- a/arch/arm/mach-at91/at91rm9200.c
+++ b/arch/arm/mach-at91/at91rm9200.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/reboot.h>
 
 #include <asm/irq.h>
 #include <asm/mach/arch.h>
@@ -304,7 +305,7 @@ static void at91rm9200_idle(void)
 	at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK);
 }
 
-static void at91rm9200_restart(char mode, const char *cmd)
+static void at91rm9200_restart(enum reboot_mode reboot_mode, const char *cmd)
 {
 	/*
 	 * Perform a hardware reset with the use of the Watchdog timer.
diff --git a/arch/arm/mach-at91/generic.h b/arch/arm/mach-at91/generic.h
index f6de36a..dc6e2f5 100644
--- a/arch/arm/mach-at91/generic.h
+++ b/arch/arm/mach-at91/generic.h
@@ -10,6 +10,7 @@
 
 #include <linux/clkdev.h>
 #include <linux/of.h>
+#include <linux/reboot.h>
 
  /* Map io */
 extern void __init at91_map_io(void);
@@ -60,8 +61,8 @@ extern void at91sam9_idle(void);
 
 /* reset */
 extern void at91_ioremap_rstc(u32 base_addr);
-extern void at91sam9_alt_restart(char, const char *);
-extern void at91sam9g45_restart(char, const char *);
+extern void at91sam9_alt_restart(enum reboot_mode, const char *);
+extern void at91sam9g45_restart(enum reboot_mode, const char *);
 
 /* shutdown */
 extern void at91_ioremap_shdwc(u32 base_addr);
diff --git a/arch/arm/mach-bcm2835/bcm2835.c b/arch/arm/mach-bcm2835/bcm2835.c
index 740fa9e..40686d7 100644
--- a/arch/arm/mach-bcm2835/bcm2835.c
+++ b/arch/arm/mach-bcm2835/bcm2835.c
@@ -53,7 +53,7 @@ static void bcm2835_setup_restart(void)
 	WARN(!wdt_regs, "failed to remap watchdog regs");
 }
 
-static void bcm2835_restart(char mode, const char *cmd)
+static void bcm2835_restart(enum reboot_mode mode, const char *cmd)
 {
 	u32 val;
 
@@ -91,7 +91,7 @@ static void bcm2835_power_off(void)
 	writel_relaxed(val, wdt_regs + PM_RSTS);
 
 	/* Continue with normal reset mechanism */
-	bcm2835_restart(0, "");
+	bcm2835_restart(REBOOT_HARD, "");
 }
 
 static struct map_desc io_map __initdata = {
diff --git a/arch/arm/mach-clps711x/common.c b/arch/arm/mach-clps711x/common.c
index f6d1746..4ca2f3c 100644
--- a/arch/arm/mach-clps711x/common.c
+++ b/arch/arm/mach-clps711x/common.c
@@ -384,7 +384,7 @@ void __init clps711x_timer_init(void)
 	setup_irq(IRQ_TC2OI, &clps711x_timer_irq);
 }
 
-void clps711x_restart(char mode, const char *cmd)
+void clps711x_restart(enum reboot_mode mode, const char *cmd)
 {
 	soft_restart(0);
 }
diff --git a/arch/arm/mach-clps711x/common.h b/arch/arm/mach-clps711x/common.h
index 2a22f4c..9a6767b 100644
--- a/arch/arm/mach-clps711x/common.h
+++ b/arch/arm/mach-clps711x/common.h
@@ -4,6 +4,8 @@
  * Common bits.
  */
 
+#include <linux/reboot.h>
+
 #define CLPS711X_NR_IRQS	(33)
 #define CLPS711X_NR_GPIO	(4 * 8 + 3)
 #define CLPS711X_GPIO(prt, bit)	((prt) * 8 + (bit))
@@ -12,5 +14,5 @@ extern void clps711x_map_io(void);
 extern void clps711x_init_irq(void);
 extern void clps711x_timer_init(void);
 extern void clps711x_handle_irq(struct pt_regs *regs);
-extern void clps711x_restart(char mode, const char *cmd);
+extern void clps711x_restart(enum reboot_mode mode, const char *cmd);
 extern void clps711x_init_early(void);
diff --git a/arch/arm/mach-cns3xxx/core.h b/arch/arm/mach-cns3xxx/core.h
index b23b17b..5218b61 100644
--- a/arch/arm/mach-cns3xxx/core.h
+++ b/arch/arm/mach-cns3xxx/core.h
@@ -11,6 +11,8 @@
 #ifndef __CNS3XXX_CORE_H
 #define __CNS3XXX_CORE_H
 
+#include <linux/reboot.h>
+
 extern void cns3xxx_timer_init(void);
 
 #ifdef CONFIG_CACHE_L2X0
@@ -22,6 +24,6 @@ static inline void cns3xxx_l2x0_init(void) {}
 void __init cns3xxx_map_io(void);
 void __init cns3xxx_init_irq(void);
 void cns3xxx_power_off(void);
-void cns3xxx_restart(char, const char *);
+void cns3xxx_restart(enum reboot_mode, const char *);
 
 #endif /* __CNS3XXX_CORE_H */
diff --git a/arch/arm/mach-cns3xxx/pm.c b/arch/arm/mach-cns3xxx/pm.c
index 79e3d47..fb38c72 100644
--- a/arch/arm/mach-cns3xxx/pm.c
+++ b/arch/arm/mach-cns3xxx/pm.c
@@ -89,7 +89,7 @@ void cns3xxx_pwr_soft_rst(unsigned int block)
 }
 EXPORT_SYMBOL(cns3xxx_pwr_soft_rst);
 
-void cns3xxx_restart(char mode, const char *cmd)
+void cns3xxx_restart(enum reboot_mode mode, const char *cmd)
 {
 	/*
 	 * To reset, we hit the on-board reset register
diff --git a/arch/arm/mach-davinci/devices-da8xx.c b/arch/arm/mach-davinci/devices-da8xx.c
index eb254fe..71a46a3 100644
--- a/arch/arm/mach-davinci/devices-da8xx.c
+++ b/arch/arm/mach-davinci/devices-da8xx.c
@@ -16,6 +16,7 @@
 #include <linux/serial_8250.h>
 #include <linux/ahci_platform.h>
 #include <linux/clk.h>
+#include <linux/reboot.h>
 
 #include <mach/cputype.h>
 #include <mach/common.h>
@@ -366,7 +367,7 @@ static struct platform_device da8xx_wdt_device = {
 	.resource	= da8xx_watchdog_resources,
 };
 
-void da8xx_restart(char mode, const char *cmd)
+void da8xx_restart(enum reboot_mode mode, const char *cmd)
 {
 	struct device *dev;
 
diff --git a/arch/arm/mach-davinci/devices.c b/arch/arm/mach-davinci/devices.c
index 90b83d0..111573c 100644
--- a/arch/arm/mach-davinci/devices.c
+++ b/arch/arm/mach-davinci/devices.c
@@ -13,6 +13,7 @@
 #include <linux/platform_device.h>
 #include <linux/dma-mapping.h>
 #include <linux/io.h>
+#include <linux/reboot.h>
 
 #include <mach/hardware.h>
 #include <linux/platform_data/i2c-davinci.h>
@@ -307,7 +308,7 @@ struct platform_device davinci_wdt_device = {
 	.resource	= wdt_resources,
 };
 
-void davinci_restart(char mode, const char *cmd)
+void davinci_restart(enum reboot_mode mode, const char *cmd)
 {
 	davinci_watchdog_reset(&davinci_wdt_device);
 }
diff --git a/arch/arm/mach-davinci/include/mach/common.h b/arch/arm/mach-davinci/include/mach/common.h
index b124b77..cce316b 100644
--- a/arch/arm/mach-davinci/include/mach/common.h
+++ b/arch/arm/mach-davinci/include/mach/common.h
@@ -14,6 +14,7 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <linux/reboot.h>
 
 extern void davinci_timer_init(void);
 
@@ -81,7 +82,7 @@ extern struct davinci_soc_info davinci_soc_info;
 
 extern void davinci_common_init(struct davinci_soc_info *soc_info);
 extern void davinci_init_ide(void);
-void davinci_restart(char mode, const char *cmd);
+void davinci_restart(enum reboot_mode mode, const char *cmd);
 void davinci_init_late(void);
 
 #ifdef CONFIG_DAVINCI_RESET_CLOCKS
diff --git a/arch/arm/mach-davinci/include/mach/da8xx.h b/arch/arm/mach-davinci/include/mach/da8xx.h
index 3c797e2..7b41a5e 100644
--- a/arch/arm/mach-davinci/include/mach/da8xx.h
+++ b/arch/arm/mach-davinci/include/mach/da8xx.h
@@ -17,6 +17,7 @@
 #include <linux/davinci_emac.h>
 #include <linux/spi/spi.h>
 #include <linux/platform_data/davinci_asp.h>
+#include <linux/reboot.h>
 #include <linux/videodev2.h>
 
 #include <mach/serial.h>
@@ -106,7 +107,7 @@ int da850_register_vpif_display
 			(struct vpif_display_config *display_config);
 int da850_register_vpif_capture
 			(struct vpif_capture_config *capture_config);
-void da8xx_restart(char mode, const char *cmd);
+void da8xx_restart(enum reboot_mode mode, const char *cmd);
 void da8xx_rproc_reserve_cma(void);
 int da8xx_register_rproc(void);
 
diff --git a/arch/arm/mach-davinci/include/mach/tnetv107x.h b/arch/arm/mach-davinci/include/mach/tnetv107x.h
index 366e975..16314c6 100644
--- a/arch/arm/mach-davinci/include/mach/tnetv107x.h
+++ b/arch/arm/mach-davinci/include/mach/tnetv107x.h
@@ -35,6 +35,7 @@
 #include <linux/serial_8250.h>
 #include <linux/input/matrix_keypad.h>
 #include <linux/mfd/ti_ssp.h>
+#include <linux/reboot.h>
 
 #include <linux/platform_data/mmc-davinci.h>
 #include <linux/platform_data/mtd-davinci.h>
@@ -54,7 +55,7 @@ extern struct platform_device tnetv107x_serial_device;
 extern void tnetv107x_init(void);
 extern void tnetv107x_devices_init(struct tnetv107x_device_info *);
 extern void tnetv107x_irq_init(void);
-void tnetv107x_restart(char mode, const char *cmd);
+void tnetv107x_restart(enum reboot_mode mode, const char *cmd);
 
 #endif
 
diff --git a/arch/arm/mach-davinci/tnetv107x.c b/arch/arm/mach-davinci/tnetv107x.c
index 3b2a70d..4545667 100644
--- a/arch/arm/mach-davinci/tnetv107x.c
+++ b/arch/arm/mach-davinci/tnetv107x.c
@@ -19,6 +19,7 @@
 #include <linux/io.h>
 #include <linux/err.h>
 #include <linux/platform_device.h>
+#include <linux/reboot.h>
 
 #include <asm/mach/map.h>
 
@@ -730,7 +731,7 @@ static void tnetv107x_watchdog_reset(struct platform_device *pdev)
 	__raw_writel(1, &regs->kick);
 }
 
-void tnetv107x_restart(char mode, const char *cmd)
+void tnetv107x_restart(enum reboot_mode mode, const char *cmd)
 {
 	tnetv107x_watchdog_reset(&tnetv107x_wdt_device);
 }
diff --git a/arch/arm/mach-dove/common.c b/arch/arm/mach-dove/common.c
index 2a9443d..00247c7 100644
--- a/arch/arm/mach-dove/common.c
+++ b/arch/arm/mach-dove/common.c
@@ -381,7 +381,7 @@ void __init dove_init(void)
 	dove_xor1_init();
 }
 
-void dove_restart(char mode, const char *cmd)
+void dove_restart(enum reboot_mode mode, const char *cmd)
 {
 	/*
 	 * Enable soft reset to assert RSTOUTn.
diff --git a/arch/arm/mach-dove/common.h b/arch/arm/mach-dove/common.h
index e863479..1d72522 100644
--- a/arch/arm/mach-dove/common.h
+++ b/arch/arm/mach-dove/common.h
@@ -11,6 +11,8 @@
 #ifndef __ARCH_DOVE_COMMON_H
 #define __ARCH_DOVE_COMMON_H
 
+#include <linux/reboot.h>
+
 struct mv643xx_eth_platform_data;
 struct mv_sata_platform_data;
 
@@ -42,6 +44,6 @@ void dove_spi1_init(void);
 void dove_i2c_init(void);
 void dove_sdio0_init(void);
 void dove_sdio1_init(void);
-void dove_restart(char, const char *);
+void dove_restart(enum reboot_mode, const char *);
 
 #endif
diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c
index 41d2d90..68ac934 100644
--- a/arch/arm/mach-ebsa110/core.c
+++ b/arch/arm/mach-ebsa110/core.c
@@ -311,7 +311,7 @@ static int __init ebsa110_init(void)
 
 arch_initcall(ebsa110_init);
 
-static void ebsa110_restart(char mode, const char *cmd)
+static void ebsa110_restart(enum reboot_mode mode, const char *cmd)
 {
 	soft_restart(0x80000000);
 }
diff --git a/arch/arm/mach-ep93xx/core.c b/arch/arm/mach-ep93xx/core.c
index c49ed3d..df8612f 100644
--- a/arch/arm/mach-ep93xx/core.c
+++ b/arch/arm/mach-ep93xx/core.c
@@ -35,6 +35,7 @@
 #include <linux/spi/spi.h>
 #include <linux/export.h>
 #include <linux/irqchip/arm-vic.h>
+#include <linux/reboot.h>
 
 #include <mach/hardware.h>
 #include <linux/platform_data/video-ep93xx.h>
@@ -921,7 +922,7 @@ void __init ep93xx_init_devices(void)
 	gpio_led_register_device(-1, &ep93xx_led_data);
 }
 
-void ep93xx_restart(char mode, const char *cmd)
+void ep93xx_restart(enum reboot_mode mode, const char *cmd)
 {
 	/*
 	 * Set then clear the SWRST bit to initiate a software reset
diff --git a/arch/arm/mach-ep93xx/include/mach/platform.h b/arch/arm/mach-ep93xx/include/mach/platform.h
index a14e1b3..e256e0b 100644
--- a/arch/arm/mach-ep93xx/include/mach/platform.h
+++ b/arch/arm/mach-ep93xx/include/mach/platform.h
@@ -4,6 +4,8 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/reboot.h>
+
 struct i2c_gpio_platform_data;
 struct i2c_board_info;
 struct spi_board_info;
@@ -55,7 +57,7 @@ void ep93xx_ide_release_gpio(struct platform_device *pdev);
 void ep93xx_init_devices(void);
 extern void ep93xx_timer_init(void);
 
-void ep93xx_restart(char, const char *);
+void ep93xx_restart(enum reboot_mode, const char *);
 void ep93xx_init_late(void);
 
 #ifdef CONFIG_CRUNCH
diff --git a/arch/arm/mach-exynos/common.c b/arch/arm/mach-exynos/common.c
index 2c655db..164685b 100644
--- a/arch/arm/mach-exynos/common.c
+++ b/arch/arm/mach-exynos/common.c
@@ -285,12 +285,12 @@ static struct map_desc exynos5440_iodesc0[] __initdata = {
 	},
 };
 
-void exynos4_restart(char mode, const char *cmd)
+void exynos4_restart(enum reboot_mode mode, const char *cmd)
 {
 	__raw_writel(0x1, S5P_SWRESET);
 }
 
-void exynos5_restart(char mode, const char *cmd)
+void exynos5_restart(enum reboot_mode mode, const char *cmd)
 {
 	struct device_node *np;
 	u32 val;
diff --git a/arch/arm/mach-exynos/common.h b/arch/arm/mach-exynos/common.h
index 38d45fd..3e156bc 100644
--- a/arch/arm/mach-exynos/common.h
+++ b/arch/arm/mach-exynos/common.h
@@ -12,6 +12,7 @@
 #ifndef __ARCH_ARM_MACH_EXYNOS_COMMON_H
 #define __ARCH_ARM_MACH_EXYNOS_COMMON_H
 
+#include <linux/reboot.h>
 #include <linux/of.h>
 
 void mct_init(void __iomem *base, int irq_g0, int irq_l0, int irq_l1);
@@ -20,8 +21,8 @@ extern unsigned long xxti_f, xusbxti_f;
 
 struct map_desc;
 void exynos_init_io(void);
-void exynos4_restart(char mode, const char *cmd);
-void exynos5_restart(char mode, const char *cmd);
+void exynos4_restart(enum reboot_mode mode, const char *cmd);
+void exynos5_restart(enum reboot_mode mode, const char *cmd);
 void exynos_init_late(void);
 
 /* ToDo: remove these after migrating legacy exynos4 platforms to dt */
diff --git a/arch/arm/mach-footbridge/common.c b/arch/arm/mach-footbridge/common.c
index a42b369..2739ca2 100644
--- a/arch/arm/mach-footbridge/common.c
+++ b/arch/arm/mach-footbridge/common.c
@@ -198,9 +198,9 @@ void __init footbridge_map_io(void)
 	}
 }
 
-void footbridge_restart(char mode, const char *cmd)
+void footbridge_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode == 's') {
+	if (mode == REBOOT_SOFT) {
 		/* Jump into the ROM */
 		soft_restart(0x41000000);
 	} else {
diff --git a/arch/arm/mach-footbridge/common.h b/arch/arm/mach-footbridge/common.h
index a846e50..56607b3 100644
--- a/arch/arm/mach-footbridge/common.h
+++ b/arch/arm/mach-footbridge/common.h
@@ -1,3 +1,4 @@
+#include <linux/reboot.h>
 
 extern void footbridge_timer_init(void);
 extern void isa_timer_init(void);
@@ -8,4 +9,4 @@ extern void footbridge_map_io(void);
 extern void footbridge_init_irq(void);
 
 extern void isa_init_irq(unsigned int irq);
-extern void footbridge_restart(char, const char *);
+extern void footbridge_restart(enum reboot_mode, const char *);
diff --git a/arch/arm/mach-footbridge/netwinder-hw.c b/arch/arm/mach-footbridge/netwinder-hw.c
index 90ea23f..1fd2cf0 100644
--- a/arch/arm/mach-footbridge/netwinder-hw.c
+++ b/arch/arm/mach-footbridge/netwinder-hw.c
@@ -634,9 +634,9 @@ fixup_netwinder(struct tag *tags, char **cmdline, struct meminfo *mi)
 #endif
 }
 
-static void netwinder_restart(char mode, const char *cmd)
+static void netwinder_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode == 's') {
+	if (mode == REBOOT_SOFT) {
 		/* Jump into the ROM */
 		soft_restart(0x41000000);
 	} else {
diff --git a/arch/arm/mach-highbank/core.h b/arch/arm/mach-highbank/core.h
index 3f65206..aea1ec5 100644
--- a/arch/arm/mach-highbank/core.h
+++ b/arch/arm/mach-highbank/core.h
@@ -1,8 +1,10 @@
 #ifndef __HIGHBANK_CORE_H
 #define __HIGHBANK_CORE_H
 
+#include <linux/reboot.h>
+
 extern void highbank_set_cpu_jump(int cpu, void *jump_addr);
-extern void highbank_restart(char, const char *);
+extern void highbank_restart(enum reboot_mode, const char *);
 extern void __iomem *scu_base_addr;
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/arch/arm/mach-highbank/system.c b/arch/arm/mach-highbank/system.c
index 37d8384..2df5870 100644
--- a/arch/arm/mach-highbank/system.c
+++ b/arch/arm/mach-highbank/system.c
@@ -15,13 +15,14 @@
  */
 #include <linux/io.h>
 #include <asm/proc-fns.h>
+#include <linux/reboot.h>
 
 #include "core.h"
 #include "sysregs.h"
 
-void highbank_restart(char mode, const char *cmd)
+void highbank_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode == 'h')
+	if (mode == REBOOT_HARD)
 		highbank_set_pwr_hard_reset();
 	else
 		highbank_set_pwr_soft_reset();
diff --git a/arch/arm/mach-imx/common.h b/arch/arm/mach-imx/common.h
index ee78847..cb6c838 100644
--- a/arch/arm/mach-imx/common.h
+++ b/arch/arm/mach-imx/common.h
@@ -11,6 +11,8 @@
 #ifndef __ASM_ARCH_MXC_COMMON_H__
 #define __ASM_ARCH_MXC_COMMON_H__
 
+#include <linux/reboot.h>
+
 struct platform_device;
 struct pt_regs;
 struct clk;
@@ -71,7 +73,7 @@ extern int mx53_clocks_init_dt(void);
 extern struct platform_device *mxc_register_gpio(char *name, int id,
 	resource_size_t iobase, resource_size_t iosize, int irq, int irq_high);
 extern void mxc_set_cpu_type(unsigned int type);
-extern void mxc_restart(char, const char *);
+extern void mxc_restart(enum reboot_mode, const char *);
 extern void mxc_arch_reset_init(void __iomem *);
 extern void mxc_arch_reset_init_dt(void);
 extern int mx53_revision(void);
diff --git a/arch/arm/mach-imx/mach-imx6q.c b/arch/arm/mach-imx/mach-imx6q.c
index f596522..7be13f8 100644
--- a/arch/arm/mach-imx/mach-imx6q.c
+++ b/arch/arm/mach-imx/mach-imx6q.c
@@ -27,6 +27,7 @@
 #include <linux/of_platform.h>
 #include <linux/opp.h>
 #include <linux/phy.h>
+#include <linux/reboot.h>
 #include <linux/regmap.h>
 #include <linux/micrel_phy.h>
 #include <linux/mfd/syscon.h>
@@ -67,7 +68,7 @@ static void __init imx6q_init_revision(void)
 	mxc_set_cpu_type(rev >> 16 & 0xff);
 }
 
-static void imx6q_restart(char mode, const char *cmd)
+static void imx6q_restart(enum reboot_mode mode, const char *cmd)
 {
 	struct device_node *np;
 	void __iomem *wdog_base;
diff --git a/arch/arm/mach-imx/system.c b/arch/arm/mach-imx/system.c
index 7cdc79a..6fe81bb 100644
--- a/arch/arm/mach-imx/system.c
+++ b/arch/arm/mach-imx/system.c
@@ -37,7 +37,7 @@ static struct clk *wdog_clk;
 /*
  * Reset the system. It is called by machine_restart().
  */
-void mxc_restart(char mode, const char *cmd)
+void mxc_restart(enum reboot_mode mode, const char *cmd)
 {
 	unsigned int wcr_enable;
 
diff --git a/arch/arm/mach-integrator/common.h b/arch/arm/mach-integrator/common.h
index 72516658b..ad0ac55 100644
--- a/arch/arm/mach-integrator/common.h
+++ b/arch/arm/mach-integrator/common.h
@@ -1,7 +1,8 @@
+#include <linux/reboot.h>
 #include <linux/amba/serial.h>
 extern struct amba_pl010_data ap_uart_data;
 void integrator_init_early(void);
 int integrator_init(bool is_cp);
 void integrator_reserve(void);
-void integrator_restart(char, const char *);
+void integrator_restart(enum reboot_mode, const char *);
 void integrator_init_sysfs(struct device *parent, u32 id);
diff --git a/arch/arm/mach-integrator/core.c b/arch/arm/mach-integrator/core.c
index 81461d2..4cdfd73 100644
--- a/arch/arm/mach-integrator/core.c
+++ b/arch/arm/mach-integrator/core.c
@@ -124,7 +124,7 @@ void __init integrator_reserve(void)
 /*
  * To reset, we hit the on-board reset register in the system FPGA
  */
-void integrator_restart(char mode, const char *cmd)
+void integrator_restart(enum reboot_mode mode, const char *cmd)
 {
 	cm_control(CM_CTRL_RESET, CM_CTRL_RESET);
 }
diff --git a/arch/arm/mach-iop13xx/include/mach/iop13xx.h b/arch/arm/mach-iop13xx/include/mach/iop13xx.h
index 7480f58..17b4027 100644
--- a/arch/arm/mach-iop13xx/include/mach/iop13xx.h
+++ b/arch/arm/mach-iop13xx/include/mach/iop13xx.h
@@ -2,6 +2,9 @@
 #define _IOP13XX_HW_H_
 
 #ifndef __ASSEMBLY__
+
+#include <linux/reboot.h>
+
 /* The ATU offsets can change based on the strapping */
 extern u32 iop13xx_atux_pmmr_offset;
 extern u32 iop13xx_atue_pmmr_offset;
@@ -11,7 +14,7 @@ void iop13xx_map_io(void);
 void iop13xx_platform_init(void);
 void iop13xx_add_tpmi_devices(void);
 void iop13xx_init_irq(void);
-void iop13xx_restart(char, const char *);
+void iop13xx_restart(enum reboot_mode, const char *);
 
 /* CPUID CP6 R0 Page 0 */
 static inline int iop13xx_cpu_id(void)
diff --git a/arch/arm/mach-iop13xx/setup.c b/arch/arm/mach-iop13xx/setup.c
index 1c5bd76..96e6c7a 100644
--- a/arch/arm/mach-iop13xx/setup.c
+++ b/arch/arm/mach-iop13xx/setup.c
@@ -594,7 +594,7 @@ __setup("iop13xx_init_adma", iop13xx_init_adma_setup);
 __setup("iop13xx_init_uart", iop13xx_init_uart_setup);
 __setup("iop13xx_init_i2c", iop13xx_init_i2c_setup);
 
-void iop13xx_restart(char mode, const char *cmd)
+void iop13xx_restart(enum reboot_mode mode, const char *cmd)
 {
 	/*
 	 * Reset the internal bus (warning both cores are reset)
diff --git a/arch/arm/mach-iop32x/n2100.c b/arch/arm/mach-iop32x/n2100.c
index ea0984a..0691443 100644
--- a/arch/arm/mach-iop32x/n2100.c
+++ b/arch/arm/mach-iop32x/n2100.c
@@ -286,7 +286,7 @@ static void n2100_power_off(void)
 		;
 }
 
-static void n2100_restart(char mode, const char *cmd)
+static void n2100_restart(enum reboot_mode mode, const char *cmd)
 {
 	gpio_line_set(N2100_HARDWARE_RESET, GPIO_LOW);
 	gpio_line_config(N2100_HARDWARE_RESET, GPIO_OUT);
diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c
index 1f6c1fb..5327dec 100644
--- a/arch/arm/mach-ixp4xx/common.c
+++ b/arch/arm/mach-ixp4xx/common.c
@@ -531,9 +531,9 @@ static void __init ixp4xx_clockevent_init(void)
 					0xf, 0xfffffffe);
 }
 
-void ixp4xx_restart(char mode, const char *cmd)
+void ixp4xx_restart(enum reboot_mode mode, const char *cmd)
 {
-	if ( 1 && mode == 's') {
+	if ( 1 && mode == REBOOT_SOFT) {
 		/* Jump into ROM at address 0 */
 		soft_restart(0);
 	} else {
diff --git a/arch/arm/mach-ixp4xx/dsmg600-setup.c b/arch/arm/mach-ixp4xx/dsmg600-setup.c
index 5d413f8..686ef34 100644
--- a/arch/arm/mach-ixp4xx/dsmg600-setup.c
+++ b/arch/arm/mach-ixp4xx/dsmg600-setup.c
@@ -27,6 +27,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-gpio.h>
 
+#include <mach/hardware.h>
 #include <asm/mach-types.h>
 #include <asm/mach/arch.h>
 #include <asm/mach/flash.h>
diff --git a/arch/arm/mach-ixp4xx/include/mach/platform.h b/arch/arm/mach-ixp4xx/include/mach/platform.h
index db5afb6..4c4c6a6 100644
--- a/arch/arm/mach-ixp4xx/include/mach/platform.h
+++ b/arch/arm/mach-ixp4xx/include/mach/platform.h
@@ -13,6 +13,8 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/reboot.h>
+
 #include <asm/types.h>
 
 #ifndef	__ARMEB__
@@ -123,7 +125,7 @@ extern void ixp4xx_init_early(void);
 extern void ixp4xx_init_irq(void);
 extern void ixp4xx_sys_init(void);
 extern void ixp4xx_timer_init(void);
-extern void ixp4xx_restart(char, const char *);
+extern void ixp4xx_restart(enum reboot_mode, const char *);
 extern void ixp4xx_pci_preinit(void);
 struct pci_sys_data;
 extern int ixp4xx_setup(int nr, struct pci_sys_data *sys);
diff --git a/arch/arm/mach-kirkwood/common.c b/arch/arm/mach-kirkwood/common.c
index 7c72c72..e9238b5 100644
--- a/arch/arm/mach-kirkwood/common.c
+++ b/arch/arm/mach-kirkwood/common.c
@@ -20,6 +20,7 @@
 #include <linux/mv643xx_i2c.h>
 #include <linux/timex.h>
 #include <linux/kexec.h>
+#include <linux/reboot.h>
 #include <net/dsa.h>
 #include <asm/page.h>
 #include <asm/mach/map.h>
@@ -722,7 +723,7 @@ void __init kirkwood_init(void)
 #endif
 }
 
-void kirkwood_restart(char mode, const char *cmd)
+void kirkwood_restart(enum reboot_mode mode, const char *cmd)
 {
 	/*
 	 * Enable soft reset to assert RSTOUTn.
diff --git a/arch/arm/mach-kirkwood/common.h b/arch/arm/mach-kirkwood/common.h
index 1c09f3f..fcf3ba6 100644
--- a/arch/arm/mach-kirkwood/common.h
+++ b/arch/arm/mach-kirkwood/common.h
@@ -11,6 +11,8 @@
 #ifndef __ARCH_KIRKWOOD_COMMON_H
 #define __ARCH_KIRKWOOD_COMMON_H
 
+#include <linux/reboot.h>
+
 struct dsa_platform_data;
 struct mv643xx_eth_platform_data;
 struct mv_sata_platform_data;
@@ -53,7 +55,7 @@ void kirkwood_audio_init(void);
 void kirkwood_cpuidle_init(void);
 void kirkwood_cpufreq_init(void);
 
-void kirkwood_restart(char, const char *);
+void kirkwood_restart(enum reboot_mode, const char *);
 void kirkwood_clk_init(void);
 
 /* board init functions for boards not fully converted to fdt */
diff --git a/arch/arm/mach-ks8695/generic.h b/arch/arm/mach-ks8695/generic.h
index 6e97ce4..43253f8 100644
--- a/arch/arm/mach-ks8695/generic.h
+++ b/arch/arm/mach-ks8695/generic.h
@@ -12,5 +12,5 @@
 
 extern __init void ks8695_map_io(void);
 extern __init void ks8695_init_irq(void);
-extern void ks8695_restart(char, const char *);
+extern void ks8695_restart(enum reboot_mode, const char *);
 extern void ks8695_timer_init(void);
diff --git a/arch/arm/mach-ks8695/time.c b/arch/arm/mach-ks8695/time.c
index c272a386..426c976 100644
--- a/arch/arm/mach-ks8695/time.c
+++ b/arch/arm/mach-ks8695/time.c
@@ -154,11 +154,11 @@ void __init ks8695_timer_init(void)
 	setup_irq(KS8695_IRQ_TIMER1, &ks8695_timer_irq);
 }
 
-void ks8695_restart(char mode, const char *cmd)
+void ks8695_restart(enum reboot_mode reboot_mode, const char *cmd)
 {
 	unsigned int reg;
 
-	if (mode == 's')
+	if (reboot_mode == REBOOT_SOFT)
 		soft_restart(0);
 
 	/* disable timer0 */
diff --git a/arch/arm/mach-lpc32xx/common.c b/arch/arm/mach-lpc32xx/common.c
index 0d4db8c..d7aa54c 100644
--- a/arch/arm/mach-lpc32xx/common.c
+++ b/arch/arm/mach-lpc32xx/common.c
@@ -207,11 +207,11 @@ void __init lpc32xx_map_io(void)
 	iotable_init(lpc32xx_io_desc, ARRAY_SIZE(lpc32xx_io_desc));
 }
 
-void lpc23xx_restart(char mode, const char *cmd)
+void lpc23xx_restart(enum reboot_mode mode, const char *cmd)
 {
 	switch (mode) {
-	case 's':
-	case 'h':
+	case REBOOT_SOFT:
+	case REBOOT_HARD:
 		lpc32xx_watchdog_reset();
 		break;
 
diff --git a/arch/arm/mach-lpc32xx/common.h b/arch/arm/mach-lpc32xx/common.h
index e0b2606..1cd8853 100644
--- a/arch/arm/mach-lpc32xx/common.h
+++ b/arch/arm/mach-lpc32xx/common.h
@@ -21,6 +21,7 @@
 
 #include <mach/board.h>
 #include <linux/platform_device.h>
+#include <linux/reboot.h>
 
 /*
  * Other arch specific structures and functions
@@ -29,7 +30,7 @@ extern void lpc32xx_timer_init(void);
 extern void __init lpc32xx_init_irq(void);
 extern void __init lpc32xx_map_io(void);
 extern void __init lpc32xx_serial_init(void);
-extern void lpc23xx_restart(char, const char *);
+extern void lpc23xx_restart(enum reboot_mode, const char *);
 
 
 /*
diff --git a/arch/arm/mach-mmp/common.c b/arch/arm/mach-mmp/common.c
index 9292b79..c03b4ab 100644
--- a/arch/arm/mach-mmp/common.c
+++ b/arch/arm/mach-mmp/common.c
@@ -47,7 +47,7 @@ void __init mmp_map_io(void)
 	mmp_chip_id = __raw_readl(MMP_CHIPID);
 }
 
-void mmp_restart(char mode, const char *cmd)
+void mmp_restart(enum reboot_mode mode, const char *cmd)
 {
 	soft_restart(0);
 }
diff --git a/arch/arm/mach-mmp/common.h b/arch/arm/mach-mmp/common.h
index 0bdc50b..991d7e9 100644
--- a/arch/arm/mach-mmp/common.h
+++ b/arch/arm/mach-mmp/common.h
@@ -1,10 +1,11 @@
+#include <linux/reboot.h>
 #define ARRAY_AND_SIZE(x)	(x), ARRAY_SIZE(x)
 
 extern void timer_init(int irq);
 
 extern void __init icu_init_irq(void);
 extern void __init mmp_map_io(void);
-extern void mmp_restart(char, const char *);
+extern void mmp_restart(enum reboot_mode, const char *);
 extern void __init pxa168_clk_init(void);
 extern void __init pxa910_clk_init(void);
 extern void __init mmp2_clk_init(void);
diff --git a/arch/arm/mach-mmp/include/mach/pxa168.h b/arch/arm/mach-mmp/include/mach/pxa168.h
index 7ed1df2..459c2d0 100644
--- a/arch/arm/mach-mmp/include/mach/pxa168.h
+++ b/arch/arm/mach-mmp/include/mach/pxa168.h
@@ -1,9 +1,11 @@
 #ifndef __ASM_MACH_PXA168_H
 #define __ASM_MACH_PXA168_H
 
+#include <linux/reboot.h>
+
 extern void pxa168_timer_init(void);
 extern void __init pxa168_init_irq(void);
-extern void pxa168_restart(char, const char *);
+extern void pxa168_restart(enum reboot_mode, const char *);
 extern void pxa168_clear_keypad_wakeup(void);
 
 #include <linux/i2c.h>
diff --git a/arch/arm/mach-mmp/pxa168.c b/arch/arm/mach-mmp/pxa168.c
index a30dcf3..144e997 100644
--- a/arch/arm/mach-mmp/pxa168.c
+++ b/arch/arm/mach-mmp/pxa168.c
@@ -172,7 +172,7 @@ int __init pxa168_add_usb_host(struct mv_usb_platform_data *pdata)
 	return platform_device_register(&pxa168_device_usb_host);
 }
 
-void pxa168_restart(char mode, const char *cmd)
+void pxa168_restart(enum reboot_mode mode, const char *cmd)
 {
 	soft_restart(0xffff0000);
 }
diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c
index 749a7f8..75062ef 100644
--- a/arch/arm/mach-mv78xx0/common.c
+++ b/arch/arm/mach-mv78xx0/common.c
@@ -413,7 +413,7 @@ void __init mv78xx0_init(void)
 	clk_init();
 }
 
-void mv78xx0_restart(char mode, const char *cmd)
+void mv78xx0_restart(enum reboot_mode mode, const char *cmd)
 {
 	/*
 	 * Enable soft reset to assert RSTOUTn.
diff --git a/arch/arm/mach-mv78xx0/common.h b/arch/arm/mach-mv78xx0/common.h
index 5e9485b..6889af2 100644
--- a/arch/arm/mach-mv78xx0/common.h
+++ b/arch/arm/mach-mv78xx0/common.h
@@ -11,6 +11,8 @@
 #ifndef __ARCH_MV78XX0_COMMON_H
 #define __ARCH_MV78XX0_COMMON_H
 
+#include <linux/reboot.h>
+
 struct mv643xx_eth_platform_data;
 struct mv_sata_platform_data;
 
@@ -45,7 +47,7 @@ void mv78xx0_uart1_init(void);
 void mv78xx0_uart2_init(void);
 void mv78xx0_uart3_init(void);
 void mv78xx0_i2c_init(void);
-void mv78xx0_restart(char, const char *);
+void mv78xx0_restart(enum reboot_mode, const char *);
 
 extern void mv78xx0_timer_init(void);
 
diff --git a/arch/arm/mach-mvebu/common.h b/arch/arm/mach-mvebu/common.h
index 98defd5..e366010 100644
--- a/arch/arm/mach-mvebu/common.h
+++ b/arch/arm/mach-mvebu/common.h
@@ -17,7 +17,9 @@
 
 #define ARMADA_XP_MAX_CPUS 4
 
-void mvebu_restart(char mode, const char *cmd);
+#include <linux/reboot.h>
+
+void mvebu_restart(enum reboot_mode mode, const char *cmd);
 
 void armada_370_xp_init_irq(void);
 void armada_370_xp_handle_irq(struct pt_regs *regs);
diff --git a/arch/arm/mach-mvebu/system-controller.c b/arch/arm/mach-mvebu/system-controller.c
index b8079df..f875124 100644
--- a/arch/arm/mach-mvebu/system-controller.c
+++ b/arch/arm/mach-mvebu/system-controller.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/of_address.h>
 #include <linux/io.h>
+#include <linux/reboot.h>
 
 static void __iomem *system_controller_base;
 
@@ -63,7 +64,7 @@ static struct of_device_id of_system_controller_table[] = {
 	{ /* end of list */ },
 };
 
-void mvebu_restart(char mode, const char *cmd)
+void mvebu_restart(enum reboot_mode mode, const char *cmd)
 {
 	if (!system_controller_base) {
 		pr_err("Cannot restart, system-controller not available: check the device tree\n");
diff --git a/arch/arm/mach-mxs/mach-mxs.c b/arch/arm/mach-mxs/mach-mxs.c
index 7fa611c..6298adb 100644
--- a/arch/arm/mach-mxs/mach-mxs.c
+++ b/arch/arm/mach-mxs/mach-mxs.c
@@ -20,6 +20,7 @@
 #include <linux/gpio.h>
 #include <linux/init.h>
 #include <linux/irqchip/mxs.h>
+#include <linux/reboot.h>
 #include <linux/micrel_phy.h>
 #include <linux/of_address.h>
 #include <linux/of_platform.h>
@@ -500,7 +501,7 @@ static void __init mxs_machine_init(void)
 /*
  * Reset the system. It is called by machine_restart().
  */
-static void mxs_restart(char mode, const char *cmd)
+static void mxs_restart(enum reboot_mode mode, const char *cmd)
 {
 	struct device_node *np;
 	void __iomem *reset_addr;
diff --git a/arch/arm/mach-netx/generic.c b/arch/arm/mach-netx/generic.c
index 1504b68..db25b0c 100644
--- a/arch/arm/mach-netx/generic.c
+++ b/arch/arm/mach-netx/generic.c
@@ -24,6 +24,7 @@
 #include <linux/platform_device.h>
 #include <linux/io.h>
 #include <linux/irqchip/arm-vic.h>
+#include <linux/reboot.h>
 #include <mach/hardware.h>
 #include <asm/mach/map.h>
 #include <mach/netx-regs.h>
@@ -187,7 +188,7 @@ static int __init netx_init(void)
 
 subsys_initcall(netx_init);
 
-void netx_restart(char mode, const char *cmd)
+void netx_restart(enum reboot_mode mode, const char *cmd)
 {
 	writel(NETX_SYSTEM_RES_CR_FIRMW_RES_EN | NETX_SYSTEM_RES_CR_FIRMW_RES,
 	       NETX_SYSTEM_RES_CR);
diff --git a/arch/arm/mach-netx/generic.h b/arch/arm/mach-netx/generic.h
index 768b26b..bb2ce47 100644
--- a/arch/arm/mach-netx/generic.h
+++ b/arch/arm/mach-netx/generic.h
@@ -17,8 +17,10 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/reboot.h>
+
 extern void __init netx_map_io(void);
 extern void __init netx_init_irq(void);
-extern void netx_restart(char, const char *);
+extern void netx_restart(enum reboot_mode, const char *);
 
 extern void netx_timer_init(void);
diff --git a/arch/arm/mach-nomadik/cpu-8815.c b/arch/arm/mach-nomadik/cpu-8815.c
index 2df209e..13e0df9 100644
--- a/arch/arm/mach-nomadik/cpu-8815.c
+++ b/arch/arm/mach-nomadik/cpu-8815.c
@@ -103,7 +103,7 @@ static void __init cpu8815_map_io(void)
 	iotable_init(cpu8815_io_desc, ARRAY_SIZE(cpu8815_io_desc));
 }
 
-static void cpu8815_restart(char mode, const char *cmd)
+static void cpu8815_restart(enum reboot_mode mode, const char *cmd)
 {
 	void __iomem *srcbase = ioremap(NOMADIK_SRC_BASE, SZ_4K);
 
diff --git a/arch/arm/mach-omap1/board-voiceblue.c b/arch/arm/mach-omap1/board-voiceblue.c
index 6c116e1..4677a9c 100644
--- a/arch/arm/mach-omap1/board-voiceblue.c
+++ b/arch/arm/mach-omap1/board-voiceblue.c
@@ -26,6 +26,7 @@
 #include <linux/serial_reg.h>
 #include <linux/smc91x.h>
 #include <linux/export.h>
+#include <linux/reboot.h>
 
 #include <asm/mach-types.h>
 #include <asm/mach/arch.h>
@@ -215,7 +216,7 @@ void voiceblue_wdt_ping(void)
 	gpio_set_value(0, wdt_gpio_state);
 }
 
-static void voiceblue_restart(char mode, const char *cmd)
+static void voiceblue_restart(enum reboot_mode mode, const char *cmd)
 {
 	/*
 	 * Workaround for 5912/1611b bug mentioned in sprz209d.pdf p. 28
diff --git a/arch/arm/mach-omap1/common.h b/arch/arm/mach-omap1/common.h
index 14f7e99..abec019 100644
--- a/arch/arm/mach-omap1/common.h
+++ b/arch/arm/mach-omap1/common.h
@@ -28,6 +28,7 @@
 
 #include <linux/mtd/mtd.h>
 #include <linux/i2c-omap.h>
+#include <linux/reboot.h>
 
 #include <plat/i2c.h>
 
@@ -70,7 +71,7 @@ static inline int omap_serial_wakeup_init(void)
 void omap1_init_early(void);
 void omap1_init_irq(void);
 void omap1_init_late(void);
-void omap1_restart(char, const char *);
+void omap1_restart(enum reboot_mode, const char *);
 
 extern void __init omap_check_revision(void);
 
diff --git a/arch/arm/mach-omap1/reset.c b/arch/arm/mach-omap1/reset.c
index 5eebd7e..72bf4bf 100644
--- a/arch/arm/mach-omap1/reset.c
+++ b/arch/arm/mach-omap1/reset.c
@@ -3,6 +3,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/io.h>
+#include <linux/reboot.h>
 
 #include <mach/hardware.h>
 
@@ -22,7 +23,7 @@
 #define OMAP_EXTWARM_RST_SRC_ID_SHIFT			5
 
 
-void omap1_restart(char mode, const char *cmd)
+void omap1_restart(enum reboot_mode mode, const char *cmd)
 {
 	/*
 	 * Workaround for 5912/1611b bug mentioned in sprz209d.pdf p. 28
diff --git a/arch/arm/mach-omap2/am33xx-restart.c b/arch/arm/mach-omap2/am33xx-restart.c
index 88e4fa8..1eae962 100644
--- a/arch/arm/mach-omap2/am33xx-restart.c
+++ b/arch/arm/mach-omap2/am33xx-restart.c
@@ -6,6 +6,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/kernel.h>
+#include <linux/reboot.h>
 
 #include "common.h"
 #include "prm-regbits-33xx.h"
@@ -19,7 +20,7 @@
  * Resets the SoC.  For @cmd, see the 'reboot' syscall in
  * kernel/sys.c.  No return value.
  */
-void am33xx_restart(char mode, const char *cmd)
+void am33xx_restart(enum reboot_mode mode, const char *cmd)
 {
 	/* TODO: Handle mode and cmd if necessary */
 
diff --git a/arch/arm/mach-omap2/common.h b/arch/arm/mach-omap2/common.h
index 72cab3f..dfcc182 100644
--- a/arch/arm/mach-omap2/common.h
+++ b/arch/arm/mach-omap2/common.h
@@ -31,6 +31,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c/twl.h>
 #include <linux/i2c-omap.h>
+#include <linux/reboot.h>
 
 #include <asm/proc-fns.h>
 
@@ -119,33 +120,33 @@ static inline void omap_soc_device_init(void)
 #endif
 
 #if defined(CONFIG_SOC_OMAP2420) || defined(CONFIG_SOC_OMAP2430)
-void omap2xxx_restart(char mode, const char *cmd);
+void omap2xxx_restart(enum reboot_mode mode, const char *cmd);
 #else
-static inline void omap2xxx_restart(char mode, const char *cmd)
+static inline void omap2xxx_restart(enum reboot_mode mode, const char *cmd)
 {
 }
 #endif
 
 #ifdef CONFIG_SOC_AM33XX
-void am33xx_restart(char mode, const char *cmd);
+void am33xx_restart(enum reboot_mode mode, const char *cmd);
 #else
-static inline void am33xx_restart(char mode, const char *cmd)
+static inline void am33xx_restart(enum reboot_mode mode, const char *cmd)
 {
 }
 #endif
 
 #ifdef CONFIG_ARCH_OMAP3
-void omap3xxx_restart(char mode, const char *cmd);
+void omap3xxx_restart(enum reboot_mode mode, const char *cmd);
 #else
-static inline void omap3xxx_restart(char mode, const char *cmd)
+static inline void omap3xxx_restart(enum reboot_mode mode, const char *cmd)
 {
 }
 #endif
 
 #if defined(CONFIG_ARCH_OMAP4) || defined(CONFIG_SOC_OMAP5)
-void omap44xx_restart(char mode, const char *cmd);
+void omap44xx_restart(enum reboot_mode mode, const char *cmd);
 #else
-static inline void omap44xx_restart(char mode, const char *cmd)
+static inline void omap44xx_restart(enum reboot_mode mode, const char *cmd)
 {
 }
 #endif
diff --git a/arch/arm/mach-omap2/omap2-restart.c b/arch/arm/mach-omap2/omap2-restart.c
index 719b716..68423e2 100644
--- a/arch/arm/mach-omap2/omap2-restart.c
+++ b/arch/arm/mach-omap2/omap2-restart.c
@@ -31,7 +31,7 @@ static struct clk *reset_virt_prcm_set_ck, *reset_sys_ck;
  * Set the DPLL to bypass so that reboot completes successfully.  No
  * return value.
  */
-void omap2xxx_restart(char mode, const char *cmd)
+void omap2xxx_restart(enum reboot_mode mode, const char *cmd)
 {
 	u32 rate;
 
diff --git a/arch/arm/mach-omap2/omap3-restart.c b/arch/arm/mach-omap2/omap3-restart.c
index 923c582..5de2a0c 100644
--- a/arch/arm/mach-omap2/omap3-restart.c
+++ b/arch/arm/mach-omap2/omap3-restart.c
@@ -12,6 +12,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/reboot.h>
 
 #include "iomap.h"
 #include "common.h"
@@ -28,7 +29,7 @@
  * Resets the SoC.  For @cmd, see the 'reboot' syscall in
  * kernel/sys.c.  No return value.
  */
-void omap3xxx_restart(char mode, const char *cmd)
+void omap3xxx_restart(enum reboot_mode mode, const char *cmd)
 {
 	omap3_ctrl_write_boot_mode((cmd ? (u8)*cmd : 0));
 	omap3xxx_prm_dpll3_reset(); /* never returns */
diff --git a/arch/arm/mach-omap2/omap4-common.c b/arch/arm/mach-omap2/omap4-common.c
index 38cd3a6..5791143 100644
--- a/arch/arm/mach-omap2/omap4-common.c
+++ b/arch/arm/mach-omap2/omap4-common.c
@@ -23,6 +23,7 @@
 #include <linux/export.h>
 #include <linux/irqchip/arm-gic.h>
 #include <linux/of_address.h>
+#include <linux/reboot.h>
 
 #include <asm/hardware/cache-l2x0.h>
 #include <asm/mach/map.h>
diff --git a/arch/arm/mach-omap2/omap4-restart.c b/arch/arm/mach-omap2/omap4-restart.c
index f90e02e..41dfd7d 100644
--- a/arch/arm/mach-omap2/omap4-restart.c
+++ b/arch/arm/mach-omap2/omap4-restart.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/reboot.h>
 #include "prminst44xx.h"
 
 /**
@@ -18,7 +19,7 @@
  * Resets the SoC.  For @cmd, see the 'reboot' syscall in
  * kernel/sys.c.  No return value.
  */
-void omap44xx_restart(char mode, const char *cmd)
+void omap44xx_restart(enum reboot_mode mode, const char *cmd)
 {
 	/* XXX Should save 'cmd' into scratchpad for use after reboot */
 	omap4_prminst_global_warm_sw_reset(); /* never returns */
diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c
index f8a6db9..b41599f 100644
--- a/arch/arm/mach-orion5x/common.c
+++ b/arch/arm/mach-orion5x/common.c
@@ -347,7 +347,7 @@ void __init orion5x_init(void)
 	orion5x_wdt_init();
 }
 
-void orion5x_restart(char mode, const char *cmd)
+void orion5x_restart(enum reboot_mode mode, const char *cmd)
 {
 	/*
 	 * Enable and issue soft reset
diff --git a/arch/arm/mach-orion5x/common.h b/arch/arm/mach-orion5x/common.h
index cdaa01f..a909afb 100644
--- a/arch/arm/mach-orion5x/common.h
+++ b/arch/arm/mach-orion5x/common.h
@@ -1,6 +1,8 @@
 #ifndef __ARCH_ORION5X_COMMON_H
 #define __ARCH_ORION5X_COMMON_H
 
+#include <linux/reboot.h>
+
 struct dsa_platform_data;
 struct mv643xx_eth_platform_data;
 struct mv_sata_platform_data;
@@ -29,7 +31,7 @@ void orion5x_spi_init(void);
 void orion5x_uart0_init(void);
 void orion5x_uart1_init(void);
 void orion5x_xor_init(void);
-void orion5x_restart(char, const char *);
+void orion5x_restart(enum reboot_mode, const char *);
 
 /*
  * PCIe/PCI functions.
diff --git a/arch/arm/mach-orion5x/ls-chl-setup.c b/arch/arm/mach-orion5x/ls-chl-setup.c
index 24f4e14..6234977 100644
--- a/arch/arm/mach-orion5x/ls-chl-setup.c
+++ b/arch/arm/mach-orion5x/ls-chl-setup.c
@@ -139,7 +139,7 @@ static struct mv_sata_platform_data lschl_sata_data = {
 
 static void lschl_power_off(void)
 {
-	orion5x_restart('h', NULL);
+	orion5x_restart(REBOOT_HARD, NULL);
 }
 
 /*****************************************************************************
diff --git a/arch/arm/mach-orion5x/ls_hgl-setup.c b/arch/arm/mach-orion5x/ls_hgl-setup.c
index fc653bb..fe04c4b 100644
--- a/arch/arm/mach-orion5x/ls_hgl-setup.c
+++ b/arch/arm/mach-orion5x/ls_hgl-setup.c
@@ -185,7 +185,7 @@ static struct mv_sata_platform_data ls_hgl_sata_data = {
 
 static void ls_hgl_power_off(void)
 {
-	orion5x_restart('h', NULL);
+	orion5x_restart(REBOOT_HARD, NULL);
 }
 
 
diff --git a/arch/arm/mach-orion5x/lsmini-setup.c b/arch/arm/mach-orion5x/lsmini-setup.c
index 18e66e6..ca4dbe9 100644
--- a/arch/arm/mach-orion5x/lsmini-setup.c
+++ b/arch/arm/mach-orion5x/lsmini-setup.c
@@ -185,7 +185,7 @@ static struct mv_sata_platform_data lsmini_sata_data = {
 
 static void lsmini_power_off(void)
 {
-	orion5x_restart('h', NULL);
+	orion5x_restart(REBOOT_HARD, NULL);
 }
 
 
diff --git a/arch/arm/mach-picoxcell/common.c b/arch/arm/mach-picoxcell/common.c
index b13f51b..ec79fea 100644
--- a/arch/arm/mach-picoxcell/common.c
+++ b/arch/arm/mach-picoxcell/common.c
@@ -11,6 +11,7 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_platform.h>
+#include <linux/reboot.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
@@ -63,7 +64,7 @@ static const char *picoxcell_dt_match[] = {
 	NULL
 };
 
-static void picoxcell_wdt_restart(char mode, const char *cmd)
+static void picoxcell_wdt_restart(enum reboot_mode mode, const char *cmd)
 {
 	/*
 	 * Configure the watchdog to reset with the shortest possible timeout
diff --git a/arch/arm/mach-prima2/common.h b/arch/arm/mach-prima2/common.h
index 81135cd..a630485 100644
--- a/arch/arm/mach-prima2/common.h
+++ b/arch/arm/mach-prima2/common.h
@@ -10,6 +10,8 @@
 #define __MACH_PRIMA2_COMMON_H__
 
 #include <linux/init.h>
+#include <linux/reboot.h>
+
 #include <asm/mach/time.h>
 #include <asm/exception.h>
 
@@ -22,7 +24,7 @@ extern void sirfsoc_cpu_die(unsigned int cpu);
 
 extern void __init sirfsoc_of_irq_init(void);
 extern void __init sirfsoc_of_clk_init(void);
-extern void sirfsoc_restart(char, const char *);
+extern void sirfsoc_restart(enum reboot_mode, const char *);
 extern asmlinkage void __exception_irq_entry sirfsoc_handle_irq(struct pt_regs *regs);
 
 #ifndef CONFIG_DEBUG_LL
diff --git a/arch/arm/mach-prima2/rstc.c b/arch/arm/mach-prima2/rstc.c
index d5e0cbc..ccb5339 100644
--- a/arch/arm/mach-prima2/rstc.c
+++ b/arch/arm/mach-prima2/rstc.c
@@ -13,6 +13,7 @@
 #include <linux/device.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/reboot.h>
 
 void __iomem *sirfsoc_rstc_base;
 static DEFINE_MUTEX(rstc_lock);
@@ -84,7 +85,7 @@ int sirfsoc_reset_device(struct device *dev)
 
 #define SIRFSOC_SYS_RST_BIT  BIT(31)
 
-void sirfsoc_restart(char mode, const char *cmd)
+void sirfsoc_restart(enum reboot_mode mode, const char *cmd)
 {
 	writel(SIRFSOC_SYS_RST_BIT, sirfsoc_rstc_base);
 }
diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c
index a5b8fead..f162f1b 100644
--- a/arch/arm/mach-pxa/corgi.c
+++ b/arch/arm/mach-pxa/corgi.c
@@ -663,16 +663,16 @@ static void corgi_poweroff(void)
 		/* Green LED off tells the bootloader to halt */
 		gpio_set_value(CORGI_GPIO_LED_GREEN, 0);
 
-	pxa_restart('h', NULL);
+	pxa_restart(REBOOT_HARD, NULL);
 }
 
-static void corgi_restart(char mode, const char *cmd)
+static void corgi_restart(enum reboot_mode mode, const char *cmd)
 {
 	if (!machine_is_corgi())
 		/* Green LED on tells the bootloader to reboot */
 		gpio_set_value(CORGI_GPIO_LED_GREEN, 1);
 
-	pxa_restart('h', cmd);
+	pxa_restart(REBOOT_HARD, cmd);
 }
 
 static void __init corgi_init(void)
diff --git a/arch/arm/mach-pxa/generic.h b/arch/arm/mach-pxa/generic.h
index fd7ea39..8963984 100644
--- a/arch/arm/mach-pxa/generic.h
+++ b/arch/arm/mach-pxa/generic.h
@@ -9,6 +9,8 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/reboot.h>
+
 struct irq_data;
 
 extern void pxa_timer_init(void);
@@ -56,4 +58,4 @@ void __init pxa_set_btuart_info(void *info);
 void __init pxa_set_stuart_info(void *info);
 void __init pxa_set_hwuart_info(void *info);
 
-void pxa_restart(char, const char *);
+void pxa_restart(enum reboot_mode, const char *);
diff --git a/arch/arm/mach-pxa/mioa701.c b/arch/arm/mach-pxa/mioa701.c
index e6b0a93..acc9d3c 100644
--- a/arch/arm/mach-pxa/mioa701.c
+++ b/arch/arm/mach-pxa/mioa701.c
@@ -37,6 +37,7 @@
 #include <linux/wm97xx.h>
 #include <linux/mtd/physmap.h>
 #include <linux/usb/gpio_vbus.h>
+#include <linux/reboot.h>
 #include <linux/regulator/max1586.h>
 #include <linux/slab.h>
 #include <linux/i2c/pxa-i2c.h>
@@ -696,13 +697,13 @@ static void mioa701_machine_exit(void);
 static void mioa701_poweroff(void)
 {
 	mioa701_machine_exit();
-	pxa_restart('s', NULL);
+	pxa_restart(REBOOT_SOFT, NULL);
 }
 
-static void mioa701_restart(char c, const char *cmd)
+static void mioa701_restart(enum reboot_mode c, const char *cmd)
 {
 	mioa701_machine_exit();
-	pxa_restart('s', cmd);
+	pxa_restart(REBOOT_SOFT, cmd);
 }
 
 static struct gpio global_gpios[] = {
diff --git a/arch/arm/mach-pxa/poodle.c b/arch/arm/mach-pxa/poodle.c
index 50ccd5f..711d37e 100644
--- a/arch/arm/mach-pxa/poodle.c
+++ b/arch/arm/mach-pxa/poodle.c
@@ -422,7 +422,7 @@ static struct i2c_board_info __initdata poodle_i2c_devices[] = {
 
 static void poodle_poweroff(void)
 {
-	pxa_restart('h', NULL);
+	pxa_restart(REBOOT_HARD, NULL);
 }
 
 static void __init poodle_init(void)
diff --git a/arch/arm/mach-pxa/reset.c b/arch/arm/mach-pxa/reset.c
index 3fab583..0d5dd64 100644
--- a/arch/arm/mach-pxa/reset.c
+++ b/arch/arm/mach-pxa/reset.c
@@ -83,7 +83,7 @@ static void do_hw_reset(void)
 	writel_relaxed(readl_relaxed(OSCR) + 368640, OSMR3);
 }
 
-void pxa_restart(char mode, const char *cmd)
+void pxa_restart(enum reboot_mode mode, const char *cmd)
 {
 	local_irq_disable();
 	local_fiq_disable();
@@ -91,14 +91,14 @@ void pxa_restart(char mode, const char *cmd)
 	clear_reset_status(RESET_STATUS_ALL);
 
 	switch (mode) {
-	case 's':
+	case REBOOT_SOFT:
 		/* Jump into ROM at address 0 */
 		soft_restart(0);
 		break;
-	case 'g':
+	case REBOOT_GPIO:
 		do_gpio_reset();
 		break;
-	case 'h':
+	case REBOOT_HARD:
 	default:
 		do_hw_reset();
 		break;
diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c
index c3c0042..2125df0 100644
--- a/arch/arm/mach-pxa/spitz.c
+++ b/arch/arm/mach-pxa/spitz.c
@@ -31,6 +31,7 @@
 #include <linux/regulator/machine.h>
 #include <linux/io.h>
 #include <linux/module.h>
+#include <linux/reboot.h>
 
 #include <asm/setup.h>
 #include <asm/mach-types.h>
@@ -924,10 +925,10 @@ static inline void spitz_i2c_init(void) {}
  ******************************************************************************/
 static void spitz_poweroff(void)
 {
-	pxa_restart('g', NULL);
+	pxa_restart(REBOOT_GPIO, NULL);
 }
 
-static void spitz_restart(char mode, const char *cmd)
+static void spitz_restart(enum reboot_mode mode, const char *cmd)
 {
 	uint32_t msc0 = __raw_readl(MSC0);
 	/* Bootloader magic for a reboot */
diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c
index a41992f..0206b91 100644
--- a/arch/arm/mach-pxa/tosa.c
+++ b/arch/arm/mach-pxa/tosa.c
@@ -36,6 +36,7 @@
 #include <linux/input/matrix_keypad.h>
 #include <linux/i2c/pxa-i2c.h>
 #include <linux/usb/gpio_vbus.h>
+#include <linux/reboot.h>
 
 #include <asm/setup.h>
 #include <asm/mach-types.h>
@@ -911,10 +912,10 @@ static struct platform_device *devices[] __initdata = {
 
 static void tosa_poweroff(void)
 {
-	pxa_restart('g', NULL);
+	pxa_restart(REBOOT_GPIO, NULL);
 }
 
-static void tosa_restart(char mode, const char *cmd)
+static void tosa_restart(enum reboot_mode mode, const char *cmd)
 {
 	uint32_t msc0 = __raw_readl(MSC0);
 
diff --git a/arch/arm/mach-realview/realview_eb.c b/arch/arm/mach-realview/realview_eb.c
index 5b1c8bf..c85ddb2 100644
--- a/arch/arm/mach-realview/realview_eb.c
+++ b/arch/arm/mach-realview/realview_eb.c
@@ -29,6 +29,7 @@
 #include <linux/io.h>
 #include <linux/irqchip/arm-gic.h>
 #include <linux/platform_data/clk-realview.h>
+#include <linux/reboot.h>
 
 #include <mach/hardware.h>
 #include <asm/irq.h>
@@ -418,7 +419,7 @@ static void __init realview_eb_timer_init(void)
 	realview_eb_twd_init();
 }
 
-static void realview_eb_restart(char mode, const char *cmd)
+static void realview_eb_restart(enum reboot_mode mode, const char *cmd)
 {
 	void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL);
 	void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK);
diff --git a/arch/arm/mach-realview/realview_pb1176.c b/arch/arm/mach-realview/realview_pb1176.c
index d5e83a1..c5eade7 100644
--- a/arch/arm/mach-realview/realview_pb1176.c
+++ b/arch/arm/mach-realview/realview_pb1176.c
@@ -31,6 +31,7 @@
 #include <linux/io.h>
 #include <linux/irqchip/arm-gic.h>
 #include <linux/platform_data/clk-realview.h>
+#include <linux/reboot.h>
 
 #include <mach/hardware.h>
 #include <asm/irq.h>
@@ -329,7 +330,7 @@ static void __init realview_pb1176_timer_init(void)
 	realview_timer_init(IRQ_DC1176_TIMER0);
 }
 
-static void realview_pb1176_restart(char mode, const char *cmd)
+static void realview_pb1176_restart(enum reboot_mode mode, const char *cmd)
 {
 	void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL);
 	void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK);
diff --git a/arch/arm/mach-realview/realview_pb11mp.c b/arch/arm/mach-realview/realview_pb11mp.c
index c3cfe21..f4b0962 100644
--- a/arch/arm/mach-realview/realview_pb11mp.c
+++ b/arch/arm/mach-realview/realview_pb11mp.c
@@ -29,6 +29,7 @@
 #include <linux/io.h>
 #include <linux/irqchip/arm-gic.h>
 #include <linux/platform_data/clk-realview.h>
+#include <linux/reboot.h>
 
 #include <mach/hardware.h>
 #include <asm/irq.h>
@@ -316,7 +317,7 @@ static void __init realview_pb11mp_timer_init(void)
 	realview_pb11mp_twd_init();
 }
 
-static void realview_pb11mp_restart(char mode, const char *cmd)
+static void realview_pb11mp_restart(enum reboot_mode mode, const char *cmd)
 {
 	void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL);
 	void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK);
diff --git a/arch/arm/mach-realview/realview_pba8.c b/arch/arm/mach-realview/realview_pba8.c
index dde652a..10a3e1d 100644
--- a/arch/arm/mach-realview/realview_pba8.c
+++ b/arch/arm/mach-realview/realview_pba8.c
@@ -29,6 +29,7 @@
 #include <linux/io.h>
 #include <linux/irqchip/arm-gic.h>
 #include <linux/platform_data/clk-realview.h>
+#include <linux/reboot.h>
 
 #include <asm/irq.h>
 #include <asm/mach-types.h>
@@ -264,7 +265,7 @@ static void __init realview_pba8_timer_init(void)
 	realview_timer_init(IRQ_PBA8_TIMER0_1);
 }
 
-static void realview_pba8_restart(char mode, const char *cmd)
+static void realview_pba8_restart(enum reboot_mode mode, const char *cmd)
 {
 	void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL);
 	void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK);
diff --git a/arch/arm/mach-realview/realview_pbx.c b/arch/arm/mach-realview/realview_pbx.c
index 54f0185..9d75493 100644
--- a/arch/arm/mach-realview/realview_pbx.c
+++ b/arch/arm/mach-realview/realview_pbx.c
@@ -28,6 +28,7 @@
 #include <linux/io.h>
 #include <linux/irqchip/arm-gic.h>
 #include <linux/platform_data/clk-realview.h>
+#include <linux/reboot.h>
 
 #include <asm/irq.h>
 #include <asm/mach-types.h>
@@ -344,7 +345,7 @@ static void realview_pbx_fixup(struct tag *tags, char **from,
 #endif
 }
 
-static void realview_pbx_restart(char mode, const char *cmd)
+static void realview_pbx_restart(enum reboot_mode mode, const char *cmd)
 {
 	void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL);
 	void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK);
diff --git a/arch/arm/mach-rpc/riscpc.c b/arch/arm/mach-rpc/riscpc.c
index a302cf5..09d602b 100644
--- a/arch/arm/mach-rpc/riscpc.c
+++ b/arch/arm/mach-rpc/riscpc.c
@@ -20,6 +20,7 @@
 #include <linux/ata_platform.h>
 #include <linux/io.h>
 #include <linux/i2c.h>
+#include <linux/reboot.h>
 
 #include <asm/elf.h>
 #include <asm/mach-types.h>
@@ -201,7 +202,7 @@ static int __init rpc_init(void)
 
 arch_initcall(rpc_init);
 
-static void rpc_restart(char mode, const char *cmd)
+static void rpc_restart(enum reboot_mode mode, const char *cmd)
 {
 	iomd_writeb(0, IOMD_ROMCR0);
 
diff --git a/arch/arm/mach-s3c24xx/common.h b/arch/arm/mach-s3c24xx/common.h
index 307c371..84b2806 100644
--- a/arch/arm/mach-s3c24xx/common.h
+++ b/arch/arm/mach-s3c24xx/common.h
@@ -12,6 +12,8 @@
 #ifndef __ARCH_ARM_MACH_S3C24XX_COMMON_H
 #define __ARCH_ARM_MACH_S3C24XX_COMMON_H __FILE__
 
+#include <linux/reboot.h>
+
 struct s3c2410_uartcfg;
 
 #ifdef CONFIG_CPU_S3C2410
@@ -20,7 +22,7 @@ extern  int s3c2410a_init(void);
 extern void s3c2410_map_io(void);
 extern void s3c2410_init_uarts(struct s3c2410_uartcfg *cfg, int no);
 extern void s3c2410_init_clocks(int xtal);
-extern void s3c2410_restart(char mode, const char *cmd);
+extern void s3c2410_restart(enum reboot_mode mode, const char *cmd);
 extern void s3c2410_init_irq(void);
 #else
 #define s3c2410_init_clocks NULL
@@ -36,7 +38,7 @@ extern void s3c2412_map_io(void);
 extern void s3c2412_init_uarts(struct s3c2410_uartcfg *cfg, int no);
 extern void s3c2412_init_clocks(int xtal);
 extern  int s3c2412_baseclk_add(void);
-extern void s3c2412_restart(char mode, const char *cmd);
+extern void s3c2412_restart(enum reboot_mode mode, const char *cmd);
 extern void s3c2412_init_irq(void);
 #else
 #define s3c2412_init_clocks NULL
@@ -51,7 +53,7 @@ extern void s3c2416_map_io(void);
 extern void s3c2416_init_uarts(struct s3c2410_uartcfg *cfg, int no);
 extern void s3c2416_init_clocks(int xtal);
 extern  int s3c2416_baseclk_add(void);
-extern void s3c2416_restart(char mode, const char *cmd);
+extern void s3c2416_restart(enum reboot_mode mode, const char *cmd);
 extern void s3c2416_init_irq(void);
 
 extern struct syscore_ops s3c2416_irq_syscore_ops;
@@ -66,7 +68,7 @@ extern struct syscore_ops s3c2416_irq_syscore_ops;
 extern void s3c244x_map_io(void);
 extern void s3c244x_init_uarts(struct s3c2410_uartcfg *cfg, int no);
 extern void s3c244x_init_clocks(int xtal);
-extern void s3c244x_restart(char mode, const char *cmd);
+extern void s3c244x_restart(enum reboot_mode mode, const char *cmd);
 #else
 #define s3c244x_init_clocks NULL
 #define s3c244x_init_uarts NULL
@@ -96,7 +98,7 @@ extern void s3c2443_map_io(void);
 extern void s3c2443_init_uarts(struct s3c2410_uartcfg *cfg, int no);
 extern void s3c2443_init_clocks(int xtal);
 extern  int s3c2443_baseclk_add(void);
-extern void s3c2443_restart(char mode, const char *cmd);
+extern void s3c2443_restart(enum reboot_mode mode, const char *cmd);
 extern void s3c2443_init_irq(void);
 #else
 #define s3c2443_init_clocks NULL
diff --git a/arch/arm/mach-s3c24xx/s3c2410.c b/arch/arm/mach-s3c24xx/s3c2410.c
index ff384ac..34676d1 100644
--- a/arch/arm/mach-s3c24xx/s3c2410.c
+++ b/arch/arm/mach-s3c24xx/s3c2410.c
@@ -22,6 +22,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/serial_core.h>
 #include <linux/platform_device.h>
+#include <linux/reboot.h>
 #include <linux/io.h>
 
 #include <asm/mach/arch.h>
@@ -196,9 +197,9 @@ int __init s3c2410a_init(void)
 	return s3c2410_init();
 }
 
-void s3c2410_restart(char mode, const char *cmd)
+void s3c2410_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode == 's') {
+	if (mode == REBOOT_SOFT) {
 		soft_restart(0);
 	}
 
diff --git a/arch/arm/mach-s3c24xx/s3c2412.c b/arch/arm/mach-s3c24xx/s3c2412.c
index 0f864d4..0251650c 100644
--- a/arch/arm/mach-s3c24xx/s3c2412.c
+++ b/arch/arm/mach-s3c24xx/s3c2412.c
@@ -22,6 +22,7 @@
 #include <linux/serial_core.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
+#include <linux/reboot.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
@@ -129,9 +130,9 @@ static void s3c2412_idle(void)
 	cpu_do_idle();
 }
 
-void s3c2412_restart(char mode, const char *cmd)
+void s3c2412_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode == 's')
+	if (mode == REBOOT_SOFT)
 		soft_restart(0);
 
 	/* errata "Watch-dog/Software Reset Problem" specifies that
diff --git a/arch/arm/mach-s3c24xx/s3c2416.c b/arch/arm/mach-s3c24xx/s3c2416.c
index b9c5d38..9ef3ccf 100644
--- a/arch/arm/mach-s3c24xx/s3c2416.c
+++ b/arch/arm/mach-s3c24xx/s3c2416.c
@@ -35,6 +35,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/clk.h>
 #include <linux/io.h>
+#include <linux/reboot.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
@@ -79,9 +80,9 @@ static struct device s3c2416_dev = {
 	.bus		= &s3c2416_subsys,
 };
 
-void s3c2416_restart(char mode, const char *cmd)
+void s3c2416_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode == 's')
+	if (mode == REBOOT_SOFT)
 		soft_restart(0);
 
 	__raw_writel(S3C2443_SWRST_RESET, S3C2443_SWRST);
diff --git a/arch/arm/mach-s3c24xx/s3c2443.c b/arch/arm/mach-s3c24xx/s3c2443.c
index 8328cd6..b6c7191 100644
--- a/arch/arm/mach-s3c24xx/s3c2443.c
+++ b/arch/arm/mach-s3c24xx/s3c2443.c
@@ -22,6 +22,7 @@
 #include <linux/device.h>
 #include <linux/clk.h>
 #include <linux/io.h>
+#include <linux/reboot.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
@@ -59,9 +60,9 @@ static struct device s3c2443_dev = {
 	.bus		= &s3c2443_subsys,
 };
 
-void s3c2443_restart(char mode, const char *cmd)
+void s3c2443_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode == 's')
+	if (mode == REBOOT_SOFT)
 		soft_restart(0);
 
 	__raw_writel(S3C2443_SWRST_RESET, S3C2443_SWRST);
diff --git a/arch/arm/mach-s3c24xx/s3c244x.c b/arch/arm/mach-s3c24xx/s3c244x.c
index d0423e2..911b555 100644
--- a/arch/arm/mach-s3c24xx/s3c244x.c
+++ b/arch/arm/mach-s3c24xx/s3c244x.c
@@ -18,6 +18,7 @@
 #include <linux/init.h>
 #include <linux/serial_core.h>
 #include <linux/platform_device.h>
+#include <linux/reboot.h>
 #include <linux/device.h>
 #include <linux/syscore_ops.h>
 #include <linux/clk.h>
@@ -198,9 +199,9 @@ struct syscore_ops s3c244x_pm_syscore_ops = {
 	.resume		= s3c244x_resume,
 };
 
-void s3c244x_restart(char mode, const char *cmd)
+void s3c244x_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode == 's')
+	if (mode == REBOOT_SOFT)
 		soft_restart(0);
 
 	samsung_wdt_reset();
diff --git a/arch/arm/mach-s3c64xx/common.c b/arch/arm/mach-s3c64xx/common.c
index 1aed6f4..3f62e46 100644
--- a/arch/arm/mach-s3c64xx/common.c
+++ b/arch/arm/mach-s3c64xx/common.c
@@ -21,6 +21,7 @@
 #include <linux/ioport.h>
 #include <linux/serial_core.h>
 #include <linux/platform_device.h>
+#include <linux/reboot.h>
 #include <linux/io.h>
 #include <linux/dma-mapping.h>
 #include <linux/irq.h>
@@ -381,9 +382,9 @@ static int __init s3c64xx_init_irq_eint(void)
 }
 arch_initcall(s3c64xx_init_irq_eint);
 
-void s3c64xx_restart(char mode, const char *cmd)
+void s3c64xx_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode != 's')
+	if (mode != REBOOT_SOFT)
 		samsung_wdt_reset();
 
 	/* if all else fails, or mode was for soft, jump to 0 */
diff --git a/arch/arm/mach-s3c64xx/common.h b/arch/arm/mach-s3c64xx/common.h
index 6cfc99b..e8f990b 100644
--- a/arch/arm/mach-s3c64xx/common.h
+++ b/arch/arm/mach-s3c64xx/common.h
@@ -17,13 +17,15 @@
 #ifndef __ARCH_ARM_MACH_S3C64XX_COMMON_H
 #define __ARCH_ARM_MACH_S3C64XX_COMMON_H
 
+#include <linux/reboot.h>
+
 void s3c64xx_init_irq(u32 vic0, u32 vic1);
 void s3c64xx_init_io(struct map_desc *mach_desc, int size);
 
 void s3c64xx_register_clocks(unsigned long xtal, unsigned armclk_limit);
 void s3c64xx_setup_clocks(void);
 
-void s3c64xx_restart(char mode, const char *cmd);
+void s3c64xx_restart(enum reboot_mode mode, const char *cmd);
 void s3c64xx_init_late(void);
 
 #ifdef CONFIG_CPU_S3C6400
diff --git a/arch/arm/mach-s5p64x0/common.c b/arch/arm/mach-s5p64x0/common.c
index 76d0053..dfdfdc3 100644
--- a/arch/arm/mach-s5p64x0/common.c
+++ b/arch/arm/mach-s5p64x0/common.c
@@ -24,6 +24,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/gpio.h>
 #include <linux/irq.h>
+#include <linux/reboot.h>
 
 #include <asm/irq.h>
 #include <asm/proc-fns.h>
@@ -439,9 +440,9 @@ static int __init s5p64x0_init_irq_eint(void)
 }
 arch_initcall(s5p64x0_init_irq_eint);
 
-void s5p64x0_restart(char mode, const char *cmd)
+void s5p64x0_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode != 's')
+	if (mode != REBOOT_SOFT)
 		samsung_wdt_reset();
 
 	soft_restart(0);
diff --git a/arch/arm/mach-s5p64x0/common.h b/arch/arm/mach-s5p64x0/common.h
index f8a60fd..f3a9b43 100644
--- a/arch/arm/mach-s5p64x0/common.h
+++ b/arch/arm/mach-s5p64x0/common.h
@@ -12,6 +12,8 @@
 #ifndef __ARCH_ARM_MACH_S5P64X0_COMMON_H
 #define __ARCH_ARM_MACH_S5P64X0_COMMON_H
 
+#include <linux/reboot.h>
+
 void s5p6440_init_irq(void);
 void s5p6450_init_irq(void);
 void s5p64x0_init_io(struct map_desc *mach_desc, int size);
@@ -22,7 +24,7 @@ void s5p6440_setup_clocks(void);
 void s5p6450_register_clocks(void);
 void s5p6450_setup_clocks(void);
 
-void s5p64x0_restart(char mode, const char *cmd);
+void s5p64x0_restart(enum reboot_mode mode, const char *cmd);
 
 #ifdef CONFIG_CPU_S5P6440
 
diff --git a/arch/arm/mach-s5pc100/common.c b/arch/arm/mach-s5pc100/common.c
index 5110315..4bdfecf 100644
--- a/arch/arm/mach-s5pc100/common.c
+++ b/arch/arm/mach-s5pc100/common.c
@@ -24,6 +24,7 @@
 #include <linux/serial_core.h>
 #include <linux/platform_device.h>
 #include <linux/sched.h>
+#include <linux/reboot.h>
 
 #include <asm/irq.h>
 #include <asm/proc-fns.h>
@@ -217,9 +218,9 @@ void __init s5pc100_init_uarts(struct s3c2410_uartcfg *cfg, int no)
 	s3c24xx_init_uartdevs("s3c6400-uart", s5p_uart_resources, cfg, no);
 }
 
-void s5pc100_restart(char mode, const char *cmd)
+void s5pc100_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode != 's')
+	if (mode != REBOOT_SOFT)
 		samsung_wdt_reset();
 
 	soft_restart(0);
diff --git a/arch/arm/mach-s5pc100/common.h b/arch/arm/mach-s5pc100/common.h
index c41f912..08d782d 100644
--- a/arch/arm/mach-s5pc100/common.h
+++ b/arch/arm/mach-s5pc100/common.h
@@ -12,13 +12,15 @@
 #ifndef __ARCH_ARM_MACH_S5PC100_COMMON_H
 #define __ARCH_ARM_MACH_S5PC100_COMMON_H
 
+#include <linux/reboot.h>
+
 void s5pc100_init_io(struct map_desc *mach_desc, int size);
 void s5pc100_init_irq(void);
 
 void s5pc100_register_clocks(void);
 void s5pc100_setup_clocks(void);
 
-void s5pc100_restart(char mode, const char *cmd);
+void s5pc100_restart(enum reboot_mode mode, const char *cmd);
 
 extern  int s5pc100_init(void);
 extern void s5pc100_map_io(void);
diff --git a/arch/arm/mach-s5pv210/common.c b/arch/arm/mach-s5pv210/common.c
index 9dfe93e..023f1a7 100644
--- a/arch/arm/mach-s5pv210/common.c
+++ b/arch/arm/mach-s5pv210/common.c
@@ -143,7 +143,7 @@ static struct map_desc s5pv210_iodesc[] __initdata = {
 	}
 };
 
-void s5pv210_restart(char mode, const char *cmd)
+void s5pv210_restart(enum reboot_mode mode, const char *cmd)
 {
 	__raw_writel(0x1, S5P_SWRESET);
 }
diff --git a/arch/arm/mach-s5pv210/common.h b/arch/arm/mach-s5pv210/common.h
index 0a1cc0ae..fe1beb5 100644
--- a/arch/arm/mach-s5pv210/common.h
+++ b/arch/arm/mach-s5pv210/common.h
@@ -12,13 +12,15 @@
 #ifndef __ARCH_ARM_MACH_S5PV210_COMMON_H
 #define __ARCH_ARM_MACH_S5PV210_COMMON_H
 
+#include <linux/reboot.h>
+
 void s5pv210_init_io(struct map_desc *mach_desc, int size);
 void s5pv210_init_irq(void);
 
 void s5pv210_register_clocks(void);
 void s5pv210_setup_clocks(void);
 
-void s5pv210_restart(char mode, const char *cmd);
+void s5pv210_restart(enum reboot_mode mode, const char *cmd);
 
 extern  int s5pv210_init(void);
 extern void s5pv210_map_io(void);
diff --git a/arch/arm/mach-sa1100/generic.c b/arch/arm/mach-sa1100/generic.c
index 9db3e98..f25b611 100644
--- a/arch/arm/mach-sa1100/generic.c
+++ b/arch/arm/mach-sa1100/generic.c
@@ -19,6 +19,7 @@
 #include <linux/cpufreq.h>
 #include <linux/ioport.h>
 #include <linux/platform_device.h>
+#include <linux/reboot.h>
 
 #include <video/sa1100fb.h>
 
@@ -131,9 +132,9 @@ static void sa1100_power_off(void)
 	PMCR = PMCR_SF;
 }
 
-void sa11x0_restart(char mode, const char *cmd)
+void sa11x0_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode == 's') {
+	if (mode == REBOOT_SOFT) {
 		/* Jump into ROM at address 0 */
 		soft_restart(0);
 	} else {
diff --git a/arch/arm/mach-sa1100/generic.h b/arch/arm/mach-sa1100/generic.h
index 2abc6a1..9a33695 100644
--- a/arch/arm/mach-sa1100/generic.h
+++ b/arch/arm/mach-sa1100/generic.h
@@ -3,12 +3,13 @@
  *
  * Author: Nicolas Pitre
  */
+#include <linux/reboot.h>
 
 extern void sa1100_timer_init(void);
 extern void __init sa1100_map_io(void);
 extern void __init sa1100_init_irq(void);
 extern void __init sa1100_init_gpio(void);
-extern void sa11x0_restart(char, const char *);
+extern void sa11x0_restart(enum reboot_mode, const char *);
 extern void sa11x0_init_late(void);
 
 #define SET_BANK(__nr,__start,__size) \
diff --git a/arch/arm/mach-shark/core.c b/arch/arm/mach-shark/core.c
index 1535557..1d32c5e 100644
--- a/arch/arm/mach-shark/core.c
+++ b/arch/arm/mach-shark/core.c
@@ -11,6 +11,7 @@
 #include <linux/serial_8250.h>
 #include <linux/io.h>
 #include <linux/cpu.h>
+#include <linux/reboot.h>
 
 #include <asm/setup.h>
 #include <asm/mach-types.h>
@@ -24,7 +25,7 @@
 #define ROMCARD_SIZE            0x08000000
 #define ROMCARD_START           0x10000000
 
-static void shark_restart(char mode, const char *cmd)
+static void shark_restart(enum reboot_mode mode, const char *cmd)
 {
         short temp;
         /* Reset the Machine via pc[3] of the sequoia chipset */
diff --git a/arch/arm/mach-shmobile/board-armadillo800eva.c b/arch/arm/mach-shmobile/board-armadillo800eva.c
index 44a6215..45221fd 100644
--- a/arch/arm/mach-shmobile/board-armadillo800eva.c
+++ b/arch/arm/mach-shmobile/board-armadillo800eva.c
@@ -42,6 +42,7 @@
 #include <linux/mmc/sh_mmcif.h>
 #include <linux/mmc/sh_mobile_sdhi.h>
 #include <linux/i2c-gpio.h>
+#include <linux/reboot.h>
 #include <mach/common.h>
 #include <mach/irqs.h>
 #include <mach/r8a7740.h>
@@ -1259,7 +1260,7 @@ static void __init eva_add_early_devices(void)
 }
 
 #define RESCNT2 IOMEM(0xe6188020)
-static void eva_restart(char mode, const char *cmd)
+static void eva_restart(enum reboot_mode mode, const char *cmd)
 {
 	/* Do soft power on reset */
 	writel((1 << 31), RESCNT2);
diff --git a/arch/arm/mach-shmobile/board-kzm9g.c b/arch/arm/mach-shmobile/board-kzm9g.c
index 165483c..1068120 100644
--- a/arch/arm/mach-shmobile/board-kzm9g.c
+++ b/arch/arm/mach-shmobile/board-kzm9g.c
@@ -34,6 +34,7 @@
 #include <linux/pinctrl/machine.h>
 #include <linux/pinctrl/pinconf-generic.h>
 #include <linux/platform_device.h>
+#include <linux/reboot.h>
 #include <linux/regulator/fixed.h>
 #include <linux/regulator/machine.h>
 #include <linux/smsc911x.h>
@@ -890,7 +891,7 @@ static void __init kzm_init(void)
 	sh73a0_pm_init();
 }
 
-static void kzm9g_restart(char mode, const char *cmd)
+static void kzm9g_restart(enum reboot_mode mode, const char *cmd)
 {
 #define RESCNT2 IOMEM(0xe6188020)
 	/* Do soft power on reset */
diff --git a/arch/arm/mach-socfpga/socfpga.c b/arch/arm/mach-socfpga/socfpga.c
index 8ea11b47..bfce964 100644
--- a/arch/arm/mach-socfpga/socfpga.c
+++ b/arch/arm/mach-socfpga/socfpga.c
@@ -19,6 +19,7 @@
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
+#include <linux/reboot.h>
 
 #include <asm/hardware/cache-l2x0.h>
 #include <asm/mach/arch.h>
@@ -89,13 +90,13 @@ static void __init socfpga_init_irq(void)
 	socfpga_sysmgr_init();
 }
 
-static void socfpga_cyclone5_restart(char mode, const char *cmd)
+static void socfpga_cyclone5_restart(enum reboot_mode mode, const char *cmd)
 {
 	u32 temp;
 
 	temp = readl(rst_manager_base_addr + SOCFPGA_RSTMGR_CTRL);
 
-	if (mode == 'h')
+	if (mode == REBOOT_HARD)
 		temp |= RSTMGR_CTRL_SWCOLDRSTREQ;
 	else
 		temp |= RSTMGR_CTRL_SWWARMRSTREQ;
diff --git a/arch/arm/mach-spear/generic.h b/arch/arm/mach-spear/generic.h
index a9fd453..904f2c9 100644
--- a/arch/arm/mach-spear/generic.h
+++ b/arch/arm/mach-spear/generic.h
@@ -16,6 +16,8 @@
 #include <linux/dmaengine.h>
 #include <linux/amba/pl08x.h>
 #include <linux/init.h>
+#include <linux/reboot.h>
+
 #include <asm/mach/time.h>
 
 extern void spear13xx_timer_init(void);
@@ -32,7 +34,7 @@ void __init spear6xx_clk_init(void __iomem *misc_base);
 void __init spear13xx_map_io(void);
 void __init spear13xx_l2x0_init(void);
 
-void spear_restart(char, const char *);
+void spear_restart(enum reboot_mode, const char *);
 
 void spear13xx_secondary_startup(void);
 void __cpuinit spear13xx_cpu_die(unsigned int cpu);
diff --git a/arch/arm/mach-spear/restart.c b/arch/arm/mach-spear/restart.c
index 2b44500..ce5e098 100644
--- a/arch/arm/mach-spear/restart.c
+++ b/arch/arm/mach-spear/restart.c
@@ -12,14 +12,15 @@
  */
 #include <linux/io.h>
 #include <linux/amba/sp810.h>
+#include <linux/reboot.h>
 #include <asm/system_misc.h>
 #include <mach/spear.h>
 #include "generic.h"
 
 #define SPEAR13XX_SYS_SW_RES			(VA_MISC_BASE + 0x204)
-void spear_restart(char mode, const char *cmd)
+void spear_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode == 's') {
+	if (mode == REBOOT_SOFT) {
 		/* software reset, Jump into ROM at address 0 */
 		soft_restart(0);
 	} else {
diff --git a/arch/arm/mach-sunxi/sunxi.c b/arch/arm/mach-sunxi/sunxi.c
index 84485a1..38a3c55 100644
--- a/arch/arm/mach-sunxi/sunxi.c
+++ b/arch/arm/mach-sunxi/sunxi.c
@@ -18,6 +18,7 @@
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
 #include <linux/io.h>
+#include <linux/reboot.h>
 
 #include <linux/clk/sunxi.h>
 
@@ -33,7 +34,7 @@
 
 static void __iomem *wdt_base;
 
-static void sun4i_restart(char mode, const char *cmd)
+static void sun4i_restart(enum reboot_mode mode, const char *cmd)
 {
 	if (!wdt_base)
 		return;
diff --git a/arch/arm/mach-tegra/board.h b/arch/arm/mach-tegra/board.h
index 1787327..9a6659f 100644
--- a/arch/arm/mach-tegra/board.h
+++ b/arch/arm/mach-tegra/board.h
@@ -23,8 +23,9 @@
 #define __MACH_TEGRA_BOARD_H
 
 #include <linux/types.h>
+#include <linux/reboot.h>
 
-void tegra_assert_system_reset(char mode, const char *cmd);
+void tegra_assert_system_reset(enum reboot_mode mode, const char *cmd);
 
 void __init tegra_init_early(void);
 void __init tegra_map_common_io(void);
diff --git a/arch/arm/mach-tegra/common.c b/arch/arm/mach-tegra/common.c
index b25153e..94a119a 100644
--- a/arch/arm/mach-tegra/common.c
+++ b/arch/arm/mach-tegra/common.c
@@ -22,6 +22,7 @@
 #include <linux/io.h>
 #include <linux/clk.h>
 #include <linux/delay.h>
+#include <linux/reboot.h>
 #include <linux/irqchip.h>
 #include <linux/clk-provider.h>
 
@@ -68,7 +69,7 @@ void __init tegra_dt_init_irq(void)
 }
 #endif
 
-void tegra_assert_system_reset(char mode, const char *cmd)
+void tegra_assert_system_reset(enum reboot_mode mode, const char *cmd)
 {
 	void __iomem *reset = IO_ADDRESS(TEGRA_PMC_BASE + 0);
 	u32 reg;
diff --git a/arch/arm/mach-u300/core.c b/arch/arm/mach-u300/core.c
index 4f7ac2a..35670b1 100644
--- a/arch/arm/mach-u300/core.c
+++ b/arch/arm/mach-u300/core.c
@@ -300,11 +300,11 @@ static void __init u300_init_check_chip(void)
 /* Forward declare this function from the watchdog */
 void coh901327_watchdog_reset(void);
 
-static void u300_restart(char mode, const char *cmd)
+static void u300_restart(enum reboot_mode mode, const char *cmd)
 {
 	switch (mode) {
-	case 's':
-	case 'h':
+	case REBOOT_SOFT:
+	case REBOOT_HARD:
 #ifdef CONFIG_COH901327_WATCHDOG
 		coh901327_watchdog_reset();
 #endif
diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c
index 54bb80b..3b0572f 100644
--- a/arch/arm/mach-versatile/core.c
+++ b/arch/arm/mach-versatile/core.c
@@ -38,6 +38,7 @@
 #include <linux/clkdev.h>
 #include <linux/mtd/physmap.h>
 #include <linux/bitops.h>
+#include <linux/reboot.h>
 
 #include <asm/irq.h>
 #include <asm/hardware/arm_timer.h>
@@ -733,7 +734,7 @@ static void versatile_leds_event(led_event_t ledevt)
 }
 #endif	/* CONFIG_LEDS */
 
-void versatile_restart(char mode, const char *cmd)
+void versatile_restart(enum reboot_mode mode, const char *cmd)
 {
 	void __iomem *sys = __io_address(VERSATILE_SYS_BASE);
 	u32 val;
diff --git a/arch/arm/mach-versatile/core.h b/arch/arm/mach-versatile/core.h
index 5c1b87d..f06d576 100644
--- a/arch/arm/mach-versatile/core.h
+++ b/arch/arm/mach-versatile/core.h
@@ -24,13 +24,14 @@
 
 #include <linux/amba/bus.h>
 #include <linux/of_platform.h>
+#include <linux/reboot.h>
 
 extern void __init versatile_init(void);
 extern void __init versatile_init_early(void);
 extern void __init versatile_init_irq(void);
 extern void __init versatile_map_io(void);
 extern void versatile_timer_init(void);
-extern void versatile_restart(char, const char *);
+extern void versatile_restart(enum reboot_mode, const char *);
 extern unsigned int mmc_status(struct device *dev);
 #ifdef CONFIG_OF
 extern struct of_dev_auxdata versatile_auxdata_lookup[];
diff --git a/arch/arm/mach-vt8500/vt8500.c b/arch/arm/mach-vt8500/vt8500.c
index f8f2f00..eefaa60 100644
--- a/arch/arm/mach-vt8500/vt8500.c
+++ b/arch/arm/mach-vt8500/vt8500.c
@@ -21,6 +21,7 @@
 #include <linux/clocksource.h>
 #include <linux/io.h>
 #include <linux/pm.h>
+#include <linux/reboot.h>
 
 #include <asm/mach-types.h>
 #include <asm/mach/arch.h>
@@ -46,7 +47,7 @@
 
 static void __iomem *pmc_base;
 
-void vt8500_restart(char mode, const char *cmd)
+void vt8500_restart(enum reboot_mode mode, const char *cmd)
 {
 	if (pmc_base)
 		writel(1, pmc_base + VT8500_PMSR_REG);
diff --git a/arch/arm/mach-w90x900/cpu.c b/arch/arm/mach-w90x900/cpu.c
index 9e4dd8b..b1eabaa 100644
--- a/arch/arm/mach-w90x900/cpu.c
+++ b/arch/arm/mach-w90x900/cpu.c
@@ -230,9 +230,9 @@ void __init nuc900_init_clocks(void)
 #define	WTE	(1 << 7)
 #define	WTRE	(1 << 1)
 
-void nuc9xx_restart(char mode, const char *cmd)
+void nuc9xx_restart(enum reboot_mode mode, const char *cmd)
 {
-	if (mode == 's') {
+	if (mode == REBOOT_SOFT) {
 		/* Jump into ROM at address 0 */
 		soft_restart(0);
 	} else {
diff --git a/arch/arm/mach-w90x900/nuc9xx.h b/arch/arm/mach-w90x900/nuc9xx.h
index 88ef4b2..e3ab1e1 100644
--- a/arch/arm/mach-w90x900/nuc9xx.h
+++ b/arch/arm/mach-w90x900/nuc9xx.h
@@ -14,10 +14,13 @@
  * published by the Free Software Foundation.
  *
  */
+
+#include <linux/reboot.h>
+
 struct map_desc;
 
 /* core initialisation functions */
 
 extern void nuc900_init_irq(void);
 extern void nuc900_timer_init(void);
-extern void nuc9xx_restart(char, const char *);
+extern void nuc9xx_restart(enum reboot_mode, const char *);
diff --git a/arch/arm/plat-iop/gpio.c b/arch/arm/plat-iop/gpio.c
index e4de9be..697de6d 100644
--- a/arch/arm/plat-iop/gpio.c
+++ b/arch/arm/plat-iop/gpio.c
@@ -17,6 +17,7 @@
 #include <linux/gpio.h>
 #include <linux/export.h>
 #include <asm/hardware/iop3xx.h>
+#include <mach/gpio.h>
 
 void gpio_line_config(int line, int direction)
 {
diff --git a/arch/arm/plat-iop/restart.c b/arch/arm/plat-iop/restart.c
index 33fa699..3a4d5e5 100644
--- a/arch/arm/plat-iop/restart.c
+++ b/arch/arm/plat-iop/restart.c
@@ -11,7 +11,7 @@
 #include <asm/system_misc.h>
 #include <mach/hardware.h>
 
-void iop3xx_restart(char mode, const char *cmd)
+void iop3xx_restart(enum reboot_mode mode, const char *cmd)
 {
 	*IOP3XX_PCSR = 0x30;
 
diff --git a/drivers/power/reset/restart-poweroff.c b/drivers/power/reset/restart-poweroff.c
index 059cd15..5758033 100644
--- a/drivers/power/reset/restart-poweroff.c
+++ b/drivers/power/reset/restart-poweroff.c
@@ -15,11 +15,12 @@
 #include <linux/platform_device.h>
 #include <linux/of_platform.h>
 #include <linux/module.h>
+#include <linux/reboot.h>
 #include <asm/system_misc.h>
 
 static void restart_poweroff_do_poweroff(void)
 {
-	arm_pm_restart('h', NULL);
+	arm_pm_restart(REBOOT_HARD, NULL);
 }
 
 static int restart_poweroff_probe(struct platform_device *pdev)
diff --git a/drivers/power/reset/vexpress-poweroff.c b/drivers/power/reset/vexpress-poweroff.c
index 469e696..476aa49 100644
--- a/drivers/power/reset/vexpress-poweroff.c
+++ b/drivers/power/reset/vexpress-poweroff.c
@@ -48,7 +48,7 @@ static void vexpress_power_off(void)
 
 static struct device *vexpress_restart_device;
 
-static void vexpress_restart(char str, const char *cmd)
+static void vexpress_restart(enum reboot_mode reboot_mode, const char *cmd)
 {
 	vexpress_reset_do(vexpress_restart_device, "restart");
 }
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index ca29a6f..5a5d7f7 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -15,6 +15,7 @@ enum reboot_mode {
 	REBOOT_WARM,
 	REBOOT_HARD,
 	REBOOT_SOFT,
+	REBOOT_GPIO,
 };
 
 extern int register_reboot_notifier(struct notifier_block *);
diff --git a/include/linux/vexpress.h b/include/linux/vexpress.h
index ea7168a..617c01b 100644
--- a/include/linux/vexpress.h
+++ b/include/linux/vexpress.h
@@ -15,6 +15,7 @@
 #define _LINUX_VEXPRESS_H
 
 #include <linux/device.h>
+#include <linux/reboot.h>
 
 #define VEXPRESS_SITE_MB		0
 #define VEXPRESS_SITE_DB1		1
-- 
cgit v0.10.2


From 1b3a5d02ee070c8f9943333b9b6370f486601e0f Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Mon, 8 Jul 2013 16:01:42 -0700
Subject: reboot: move arch/x86 reboot= handling to generic kernel

Merge together the unicore32, arm, and x86 reboot= command line
parameter handling.

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Guan Xuetao <gxt@mprc.pku.edu.cn>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 25dc4a0..75236f1 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2681,9 +2681,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Run specified binary instead of /init from the ramdisk,
 			used for early userspace startup. See initrd.
 
-	reboot=		[BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode
-			Format: <reboot_mode>[,<reboot_mode2>[,...]]
-			See arch/*/kernel/reboot.c or arch/*/kernel/process.c
+	reboot=		[KNL]
+			Format (x86 or x86_64):
+				[w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \
+				[[,]s[mp]#### \
+				[[,]b[ios] | a[cpi] | k[bd] | t[riple] | e[fi] | p[ci]] \
+				[[,]f[orce]
+			Where reboot_mode is one of warm (soft) or cold (hard) or gpio,
+			      reboot_type is one of bios, acpi, kbd, triple, efi, or pci,
+			      reboot_force is either force or not specified,
+			      reboot_cpu is s[mp]#### with #### being the processor
+					to be used for rebooting.
 
 	relax_domain_level=
 			[KNL, SMP] Set scheduler's default relax_domain_level.
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index b7fdd86..d3ca4f6 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -176,16 +176,6 @@ void arch_cpu_idle(void)
 		default_idle();
 }
 
-enum reboot_mode reboot_mode = REBOOT_HARD;
-
-static int __init reboot_setup(char *str)
-{
-	if ('s' == str[0])
-		reboot_mode = REBOOT_SOFT;
-	return 1;
-}
-__setup("reboot=", reboot_setup);
-
 /*
  * Called by kexec, immediately prior to machine_kexec().
  *
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index 93dd035..778ebba 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -51,16 +51,6 @@ void arch_cpu_idle(void)
 	local_irq_enable();
 }
 
-static enum reboot_mode reboot_mode = REBOOT_HARD;
-
-int __init reboot_setup(char *str)
-{
-	if ('s' == str[0])
-		reboot_mode = REBOOT_SOFT;
-	return 1;
-}
-__setup("reboot=", reboot_setup);
-
 void machine_halt(void)
 {
 	gpio_set_value(GPO_SOFT_OFF, 0);
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
index 75ce3f4..77a99ac 100644
--- a/arch/x86/include/asm/emergency-restart.h
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -1,18 +1,6 @@
 #ifndef _ASM_X86_EMERGENCY_RESTART_H
 #define _ASM_X86_EMERGENCY_RESTART_H
 
-enum reboot_type {
-	BOOT_TRIPLE = 't',
-	BOOT_KBD = 'k',
-	BOOT_BIOS = 'b',
-	BOOT_ACPI = 'a',
-	BOOT_EFI = 'e',
-	BOOT_CF9 = 'p',
-	BOOT_CF9_COND = 'q',
-};
-
-extern enum reboot_type reboot_type;
-
 extern void machine_emergency_restart(void);
 
 #endif /* _ASM_X86_EMERGENCY_RESTART_H */
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 39cc7f7..63092af 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -25,6 +25,7 @@
 #include <linux/kdebug.h>
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
+#include <linux/reboot.h>
 
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
@@ -36,7 +37,6 @@
 #include <asm/ipi.h>
 #include <asm/smp.h>
 #include <asm/x86_init.h>
-#include <asm/emergency-restart.h>
 #include <asm/nmi.h>
 
 /* BMC sets a bit this MMR non-zero before sending an NMI */
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f770340..563ed91 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,22 +36,6 @@ void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
 
 static const struct desc_ptr no_idt = {};
-static enum reboot_mode reboot_mode;
-enum reboot_type reboot_type = BOOT_ACPI;
-int reboot_force;
-
-/*
- * This variable is used privately to keep track of whether or not
- * reboot_type is still set to its default value (i.e., reboot= hasn't
- * been set on the command line).  This is needed so that we can
- * suppress DMI scanning for reboot quirks.  Without it, it's
- * impossible to override a faulty reboot quirk without recompiling.
- */
-static int reboot_default = 1;
-
-#ifdef CONFIG_SMP
-static int reboot_cpu = -1;
-#endif
 
 /*
  * This is set if we need to go through the 'emergency' path.
@@ -64,79 +48,6 @@ static int reboot_emergency;
 bool port_cf9_safe = false;
 
 /*
- * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
- * warm   Don't set the cold reboot flag
- * cold   Set the cold reboot flag
- * bios   Reboot by jumping through the BIOS
- * smp    Reboot by executing reset on BSP or other CPU
- * triple Force a triple fault (init)
- * kbd    Use the keyboard controller. cold reset (default)
- * acpi   Use the RESET_REG in the FADT
- * efi    Use efi reset_system runtime service
- * pci    Use the so-called "PCI reset register", CF9
- * force  Avoid anything that could hang.
- */
-static int __init reboot_setup(char *str)
-{
-	for (;;) {
-		/*
-		 * Having anything passed on the command line via
-		 * reboot= will cause us to disable DMI checking
-		 * below.
-		 */
-		reboot_default = 0;
-
-		switch (*str) {
-		case 'w':
-			reboot_mode = REBOOT_WARM;
-			break;
-
-		case 'c':
-			reboot_mode = REBOOT_COLD;
-			break;
-
-#ifdef CONFIG_SMP
-		case 's':
-			if (isdigit(*(str+1))) {
-				reboot_cpu = (int) (*(str+1) - '0');
-				if (isdigit(*(str+2)))
-					reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
-			}
-			/*
-			 * We will leave sorting out the final value
-			 * when we are ready to reboot, since we might not
-			 * have detected BSP APIC ID or smp_num_cpu
-			 */
-			break;
-#endif /* CONFIG_SMP */
-
-		case 'b':
-		case 'a':
-		case 'k':
-		case 't':
-		case 'e':
-		case 'p':
-			reboot_type = *str;
-			break;
-
-		case 'f':
-			reboot_force = 1;
-			break;
-		}
-
-		str = strchr(str, ',');
-		if (str)
-			str++;
-		else
-			break;
-	}
-	return 1;
-}
-
-__setup("reboot=", reboot_setup);
-
-
-/*
  * Reboot options and system auto-detection code provided by
  * Dell Inc. so their systems "just work". :-)
  */
@@ -616,26 +527,10 @@ void native_machine_shutdown(void)
 {
 	/* Stop the cpus and apics */
 #ifdef CONFIG_SMP
-
-	/* The boot cpu is always logical cpu 0 */
-	int reboot_cpu_id = 0;
-
-	/* See if there has been given a command line override */
-	if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
-		cpu_online(reboot_cpu))
-		reboot_cpu_id = reboot_cpu;
-
-	/* Make certain the cpu I'm about to reboot on is online */
-	if (!cpu_online(reboot_cpu_id))
-		reboot_cpu_id = smp_processor_id();
-
-	/* Make certain I only run on the appropriate processor */
-	set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
-
 	/*
-	 * O.K Now that I'm on the appropriate processor, stop all of the
-	 * others. Also disable the local irq to not receive the per-cpu
-	 * timer interrupt which may trigger scheduler's load balance.
+	 * Stop all of the others. Also disable the local irq to
+	 * not receive the per-cpu timer interrupt which may trigger
+	 * scheduler's load balance.
 	 */
 	local_irq_disable();
 	stop_other_cpus();
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 5a5d7f7..8e00f9f 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -17,6 +17,23 @@ enum reboot_mode {
 	REBOOT_SOFT,
 	REBOOT_GPIO,
 };
+extern enum reboot_mode reboot_mode;
+
+enum reboot_type {
+	BOOT_TRIPLE = 't',
+	BOOT_KBD = 'k',
+	BOOT_BIOS = 'b',
+	BOOT_ACPI = 'a',
+	BOOT_EFI = 'e',
+	BOOT_CF9 = 'p',
+	BOOT_CF9_COND = 'q',
+};
+extern enum reboot_type reboot_type;
+
+extern int reboot_default;
+extern int reboot_cpu;
+extern int reboot_force;
+
 
 extern int register_reboot_notifier(struct notifier_block *);
 extern int unregister_reboot_notifier(struct notifier_block *);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index abb6a04..269ed93 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -6,6 +6,7 @@
 
 #define pr_fmt(fmt)	"reboot: " fmt
 
+#include <linux/ctype.h>
 #include <linux/export.h>
 #include <linux/kexec.h>
 #include <linux/kmod.h>
@@ -24,6 +25,18 @@ int C_A_D = 1;
 struct pid *cad_pid;
 EXPORT_SYMBOL(cad_pid);
 
+#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
+#define DEFAULT_REBOOT_MODE		= REBOOT_HARD
+#else
+#define DEFAULT_REBOOT_MODE
+#endif
+enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
+
+int reboot_default;
+int reboot_cpu;
+enum reboot_type reboot_type = BOOT_ACPI;
+int reboot_force;
+
 /*
  * If set, this is used for preparing the system to power off.
  */
@@ -87,7 +100,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 static void migrate_to_reboot_cpu(void)
 {
 	/* The boot cpu is always logical cpu 0 */
-	int cpu = 0;
+	int cpu = reboot_cpu;
 
 	cpu_hotplug_disable();
 
@@ -343,3 +356,64 @@ int orderly_poweroff(bool force)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(orderly_poweroff);
+
+static int __init reboot_setup(char *str)
+{
+	for (;;) {
+		/*
+		 * Having anything passed on the command line via
+		 * reboot= will cause us to disable DMI checking
+		 * below.
+		 */
+		reboot_default = 0;
+
+		switch (*str) {
+		case 'w':
+			reboot_mode = REBOOT_WARM;
+			break;
+
+		case 'c':
+			reboot_mode = REBOOT_COLD;
+			break;
+
+		case 'h':
+			reboot_mode = REBOOT_HARD;
+			break;
+
+		case 's':
+			if (isdigit(*(str+1)))
+				reboot_cpu = simple_strtoul(str+1, NULL, 0);
+			else if (str[1] == 'm' && str[2] == 'p' &&
+							isdigit(*(str+3)))
+				reboot_cpu = simple_strtoul(str+3, NULL, 0);
+			else
+				reboot_mode = REBOOT_SOFT;
+			break;
+
+		case 'g':
+			reboot_mode = REBOOT_GPIO;
+			break;
+
+		case 'b':
+		case 'a':
+		case 'k':
+		case 't':
+		case 'e':
+		case 'p':
+			reboot_type = *str;
+			break;
+
+		case 'f':
+			reboot_force = 1;
+			break;
+		}
+
+		str = strchr(str, ',');
+		if (str)
+			str++;
+		else
+			break;
+	}
+	return 1;
+}
+__setup("reboot=", reboot_setup);
-- 
cgit v0.10.2


From 4df87bb7b6a22dfc6fdd5abb3dd362b3af2c164d Mon Sep 17 00:00:00 2001
From: Chanho Min <chanho.min@lge.com>
Date: Mon, 8 Jul 2013 16:01:43 -0700
Subject: lib: add weak clz/ctz functions

Some architectures need __c[lt]z[sd]i2() for __builtin_c[lt]z[ll] and
that causes a build failure.  They can be implemented using the
fls()/__ffs() and overridden by linking arch-specific versions may not
be implemented yet.

This is required by "lib: add lz4 compressor module".

Reference: https://lkml.org/lkml/2013/4/18/603

Signed-off-by: Chanho Min <chanho.min@lge.com>
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: "Darrick J. Wong" <djwong@us.ibm.com>
Cc: Bob Pearson <rpearson@systemfabricworks.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Herbert Xu <herbert@gondor.hengli.com.au>
Cc: Yann Collet <yann.collet.73@gmail.com>
Cc: Kyungsik Lee <kyungsik.lee@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/lib/Makefile b/lib/Makefile
index c09e38e..9afb4f7 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -23,7 +23,7 @@ lib-y	+= kobject.o klist.o
 
 obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
-	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o \
+	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
 	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c
new file mode 100644
index 0000000..a8f8379
--- /dev/null
+++ b/lib/clz_ctz.c
@@ -0,0 +1,58 @@
+/*
+ * lib/clz_ctz.c
+ *
+ * Copyright (C) 2013 Chanho Min <chanho.min@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * __c[lt]z[sd]i2 can be overridden by linking arch-specific versions.
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+
+int __weak __ctzsi2(int val)
+{
+	return __ffs(val);
+}
+EXPORT_SYMBOL(__ctzsi2);
+
+int __weak __clzsi2(int val)
+{
+	return 32 - fls(val);
+}
+EXPORT_SYMBOL(__clzsi2);
+
+#if BITS_PER_LONG == 32
+
+int __weak __clzdi2(long val)
+{
+	return 32 - fls((int)val);
+}
+EXPORT_SYMBOL(__clzdi2);
+
+int __weak __ctzdi2(long val)
+{
+	return __ffs((u32)val);
+}
+EXPORT_SYMBOL(__ctzdi2);
+
+#elif BITS_PER_LONG == 64
+
+int __weak __clzdi2(long val)
+{
+	return 64 - fls64((u64)val);
+}
+EXPORT_SYMBOL(__clzdi2);
+
+int __weak __ctzdi2(long val)
+{
+	return __ffs64((u64)val);
+}
+EXPORT_SYMBOL(__ctzdi2);
+
+#else
+#error BITS_PER_LONG not 32 or 64
+#endif
-- 
cgit v0.10.2


From cffb78b0e0b3a30b059b27a1d97500cf6464efa9 Mon Sep 17 00:00:00 2001
From: Kyungsik Lee <kyungsik.lee@lge.com>
Date: Mon, 8 Jul 2013 16:01:45 -0700
Subject: decompressor: add LZ4 decompressor module

Add support for LZ4 decompression in the Linux Kernel.  LZ4 Decompression
APIs for kernel are based on LZ4 implementation by Yann Collet.

Benchmark Results(PATCH v3)
Compiler: Linaro ARM gcc 4.6.2

1. ARMv7, 1.5GHz based board
   Kernel: linux 3.4
   Uncompressed Kernel Size: 14MB
        Compressed Size  Decompression Speed
   LZO  6.7MB            20.1MB/s, 25.2MB/s(UA)
   LZ4  7.3MB            29.1MB/s, 45.6MB/s(UA)

2. ARMv7, 1.7GHz based board
   Kernel: linux 3.7
   Uncompressed Kernel Size: 14MB
        Compressed Size  Decompression Speed
   LZO  6.0MB            34.1MB/s, 52.2MB/s(UA)
   LZ4  6.5MB            86.7MB/s
- UA: Unaligned memory Access support
- Latest patch set for LZO applied

This patch set is for adding support for LZ4-compressed Kernel.  LZ4 is a
very fast lossless compression algorithm and it also features an extremely
fast decoder [1].

But we have five of decompressors already and one question which does
arise, however, is that of where do we stop adding new ones?  This issue
had been discussed and came to the conclusion [2].

Russell King said that we should have:

 - one decompressor which is the fastest
 - one decompressor for the highest compression ratio
 - one popular decompressor (eg conventional gzip)

If we have a replacement one for one of these, then it should do exactly
that: replace it.

The benchmark shows that an 8% increase in image size vs a 66% increase
in decompression speed compared to LZO(which has been known as the
fastest decompressor in the Kernel).  Therefore the "fast but may not be
small" compression title has clearly been taken by LZ4 [3].

[1] http://code.google.com/p/lz4/
[2] http://thread.gmane.org/gmane.linux.kbuild.devel/9157
[3] http://thread.gmane.org/gmane.linux.kbuild.devel/9347

LZ4 homepage: http://fastcompression.blogspot.com/p/lz4.html
LZ4 source repository: http://code.google.com/p/lz4/

Signed-off-by: Kyungsik Lee <kyungsik.lee@lge.com>
Signed-off-by: Yann Collet <yann.collet.73@gmail.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Florian Fainelli <florian@openwrt.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/lz4.h b/include/linux/lz4.h
new file mode 100644
index 0000000..7f6c75a
--- /dev/null
+++ b/include/linux/lz4.h
@@ -0,0 +1,51 @@
+#ifndef __LZ4_H__
+#define __LZ4_H__
+/*
+ * LZ4 Kernel Interface
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * lz4_compressbound()
+ * Provides the maximum size that LZ4 may output in a "worst case" scenario
+ * (input data not compressible)
+ */
+static inline size_t lz4_compressbound(size_t isize)
+{
+	return isize + (isize / 255) + 16;
+}
+
+/*
+ * lz4_decompress()
+ *	src     : source address of the compressed data
+ *	src_len : is the input size, whcih is returned after decompress done
+ *	dest	: output buffer address of the decompressed data
+ *	actual_dest_len: is the size of uncompressed data, supposing it's known
+ *	return  : Success if return 0
+ *		  Error if return (< 0)
+ *	note :  Destination buffer must be already allocated.
+ *		slightly faster than lz4_decompress_unknownoutputsize()
+ */
+int lz4_decompress(const char *src, size_t *src_len, char *dest,
+		size_t actual_dest_len);
+
+/*
+ * lz4_decompress_unknownoutputsize()
+ *	src     : source address of the compressed data
+ *	src_len : is the input size, therefore the compressed size
+ *	dest	: output buffer address of the decompressed data
+ *	dest_len: is the max size of the destination buffer, which is
+ *			returned with actual size of decompressed data after
+ *			decompress done
+ *	return  : Success if return 0
+ *		  Error if return (< 0)
+ *	note :  Destination buffer must be already allocated.
+ */
+int lz4_decompress_unknownoutputsize(const char *src, size_t src_len,
+		char *dest, size_t *dest_len);
+#endif
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
new file mode 100644
index 0000000..dcc8975
--- /dev/null
+++ b/lib/lz4/lz4_decompress.c
@@ -0,0 +1,326 @@
+/*
+ * LZ4 Decompressor for Linux kernel
+ *
+ * Copyright (C) 2013 LG Electronics Co., Ltd. (http://www.lge.com/)
+ *
+ * Based on LZ4 implementation by Yann Collet.
+ *
+ * LZ4 - Fast LZ compression algorithm
+ * Copyright (C) 2011-2012, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  You can contact the author at :
+ *  - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ *  - LZ4 source repository : http://code.google.com/p/lz4/
+ */
+
+#ifndef STATIC
+#include <linux/module.h>
+#include <linux/kernel.h>
+#endif
+#include <linux/lz4.h>
+
+#include <asm/unaligned.h>
+
+#include "lz4defs.h"
+
+static int lz4_uncompress(const char *source, char *dest, int osize)
+{
+	const BYTE *ip = (const BYTE *) source;
+	const BYTE *ref;
+	BYTE *op = (BYTE *) dest;
+	BYTE * const oend = op + osize;
+	BYTE *cpy;
+	unsigned token;
+	size_t length;
+	size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+	size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
+#endif
+
+	while (1) {
+
+		/* get runlength */
+		token = *ip++;
+		length = (token >> ML_BITS);
+		if (length == RUN_MASK) {
+			size_t len;
+
+			len = *ip++;
+			for (; len == 255; length += 255)
+				len = *ip++;
+			length += len;
+		}
+
+		/* copy literals */
+		cpy = op + length;
+		if (unlikely(cpy > oend - COPYLENGTH)) {
+			/*
+			 * Error: not enough place for another match
+			 * (min 4) + 5 literals
+			 */
+			if (cpy != oend)
+				goto _output_error;
+
+			memcpy(op, ip, length);
+			ip += length;
+			break; /* EOF */
+		}
+		LZ4_WILDCOPY(ip, op, cpy);
+		ip -= (op - cpy);
+		op = cpy;
+
+		/* get offset */
+		LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
+		ip += 2;
+
+		/* Error: offset create reference outside destination buffer */
+		if (unlikely(ref < (BYTE *const) dest))
+			goto _output_error;
+
+		/* get matchlength */
+		length = token & ML_MASK;
+		if (length == ML_MASK) {
+			for (; *ip == 255; length += 255)
+				ip++;
+			length += *ip++;
+		}
+
+		/* copy repeated sequence */
+		if (unlikely((op - ref) < STEPSIZE)) {
+#if LZ4_ARCH64
+			size_t dec64 = dec64table[op - ref];
+#else
+			const int dec64 = 0;
+#endif
+			op[0] = ref[0];
+			op[1] = ref[1];
+			op[2] = ref[2];
+			op[3] = ref[3];
+			op += 4;
+			ref += 4;
+			ref -= dec32table[op-ref];
+			PUT4(ref, op);
+			op += STEPSIZE - 4;
+			ref -= dec64;
+		} else {
+			LZ4_COPYSTEP(ref, op);
+		}
+		cpy = op + length - (STEPSIZE - 4);
+		if (cpy > (oend - COPYLENGTH)) {
+
+			/* Error: request to write beyond destination buffer */
+			if (cpy > oend)
+				goto _output_error;
+			LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
+			while (op < cpy)
+				*op++ = *ref++;
+			op = cpy;
+			/*
+			 * Check EOF (should never happen, since last 5 bytes
+			 * are supposed to be literals)
+			 */
+			if (op == oend)
+				goto _output_error;
+			continue;
+		}
+		LZ4_SECURECOPY(ref, op, cpy);
+		op = cpy; /* correction */
+	}
+	/* end of decoding */
+	return (int) (((char *)ip) - source);
+
+	/* write overflow error detected */
+_output_error:
+	return (int) (-(((char *)ip) - source));
+}
+
+static int lz4_uncompress_unknownoutputsize(const char *source, char *dest,
+				int isize, size_t maxoutputsize)
+{
+	const BYTE *ip = (const BYTE *) source;
+	const BYTE *const iend = ip + isize;
+	const BYTE *ref;
+
+
+	BYTE *op = (BYTE *) dest;
+	BYTE * const oend = op + maxoutputsize;
+	BYTE *cpy;
+
+	size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+	size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
+#endif
+
+	/* Main Loop */
+	while (ip < iend) {
+
+		unsigned token;
+		size_t length;
+
+		/* get runlength */
+		token = *ip++;
+		length = (token >> ML_BITS);
+		if (length == RUN_MASK) {
+			int s = 255;
+			while ((ip < iend) && (s == 255)) {
+				s = *ip++;
+				length += s;
+			}
+		}
+		/* copy literals */
+		cpy = op + length;
+		if ((cpy > oend - COPYLENGTH) ||
+			(ip + length > iend - COPYLENGTH)) {
+
+			if (cpy > oend)
+				goto _output_error;/* writes beyond buffer */
+
+			if (ip + length != iend)
+				goto _output_error;/*
+						    * Error: LZ4 format requires
+						    * to consume all input
+						    * at this stage
+						    */
+			memcpy(op, ip, length);
+			op += length;
+			break;/* Necessarily EOF, due to parsing restrictions */
+		}
+		LZ4_WILDCOPY(ip, op, cpy);
+		ip -= (op - cpy);
+		op = cpy;
+
+		/* get offset */
+		LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
+		ip += 2;
+		if (ref < (BYTE * const) dest)
+			goto _output_error;
+			/*
+			 * Error : offset creates reference
+			 * outside of destination buffer
+			 */
+
+		/* get matchlength */
+		length = (token & ML_MASK);
+		if (length == ML_MASK) {
+			while (ip < iend) {
+				int s = *ip++;
+				length += s;
+				if (s == 255)
+					continue;
+				break;
+			}
+		}
+
+		/* copy repeated sequence */
+		if (unlikely((op - ref) < STEPSIZE)) {
+#if LZ4_ARCH64
+			size_t dec64 = dec64table[op - ref];
+#else
+			const int dec64 = 0;
+#endif
+				op[0] = ref[0];
+				op[1] = ref[1];
+				op[2] = ref[2];
+				op[3] = ref[3];
+				op += 4;
+				ref += 4;
+				ref -= dec32table[op - ref];
+				PUT4(ref, op);
+				op += STEPSIZE - 4;
+				ref -= dec64;
+		} else {
+			LZ4_COPYSTEP(ref, op);
+		}
+		cpy = op + length - (STEPSIZE-4);
+		if (cpy > oend - COPYLENGTH) {
+			if (cpy > oend)
+				goto _output_error; /* write outside of buf */
+
+			LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
+			while (op < cpy)
+				*op++ = *ref++;
+			op = cpy;
+			/*
+			 * Check EOF (should never happen, since last 5 bytes
+			 * are supposed to be literals)
+			 */
+			if (op == oend)
+				goto _output_error;
+			continue;
+		}
+		LZ4_SECURECOPY(ref, op, cpy);
+		op = cpy; /* correction */
+	}
+	/* end of decoding */
+	return (int) (((char *) op) - dest);
+
+	/* write overflow error detected */
+_output_error:
+	return (int) (-(((char *) ip) - source));
+}
+
+int lz4_decompress(const char *src, size_t *src_len, char *dest,
+		size_t actual_dest_len)
+{
+	int ret = -1;
+	int input_len = 0;
+
+	input_len = lz4_uncompress(src, dest, actual_dest_len);
+	if (input_len < 0)
+		goto exit_0;
+	*src_len = input_len;
+
+	return 0;
+exit_0:
+	return ret;
+}
+#ifndef STATIC
+EXPORT_SYMBOL_GPL(lz4_decompress);
+#endif
+
+int lz4_decompress_unknownoutputsize(const char *src, size_t src_len,
+		char *dest, size_t *dest_len)
+{
+	int ret = -1;
+	int out_len = 0;
+
+	out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len,
+					*dest_len);
+	if (out_len < 0)
+		goto exit_0;
+	*dest_len = out_len;
+
+	return 0;
+exit_0:
+	return ret;
+}
+#ifndef STATIC
+EXPORT_SYMBOL_GPL(lz4_decompress_unknownoutputsize);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZ4 Decompressor");
+#endif
diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
new file mode 100644
index 0000000..43ac31d
--- /dev/null
+++ b/lib/lz4/lz4defs.h
@@ -0,0 +1,94 @@
+/*
+ * lz4defs.h -- architecture specific defines
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Detects 64 bits mode
+ */
+#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) \
+	|| defined(__ppc64__) || defined(__LP64__))
+#define LZ4_ARCH64 1
+#else
+#define LZ4_ARCH64 0
+#endif
+
+/*
+ * Architecture-specific macros
+ */
+#define BYTE	u8
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)		\
+	|| defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6	\
+	&& defined(ARM_EFFICIENT_UNALIGNED_ACCESS)
+typedef struct _U32_S { u32 v; } U32_S;
+typedef struct _U64_S { u64 v; } U64_S;
+
+#define A32(x) (((U32_S *)(x))->v)
+#define A64(x) (((U64_S *)(x))->v)
+
+#define PUT4(s, d) (A32(d) = A32(s))
+#define PUT8(s, d) (A64(d) = A64(s))
+#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */
+
+#define PUT4(s, d) \
+	put_unaligned(get_unaligned((const u32 *) s), (u32 *) d)
+#define PUT8(s, d) \
+	put_unaligned(get_unaligned((const u64 *) s), (u64 *) d)
+#endif
+
+#define COPYLENGTH 8
+#define ML_BITS  4
+#define ML_MASK  ((1U << ML_BITS) - 1)
+#define RUN_BITS (8 - ML_BITS)
+#define RUN_MASK ((1U << RUN_BITS) - 1)
+
+#if LZ4_ARCH64/* 64-bit */
+#define STEPSIZE 8
+
+#define LZ4_COPYSTEP(s, d)	\
+	do {			\
+		PUT8(s, d);	\
+		d += 8;		\
+		s += 8;		\
+	} while (0)
+
+#define LZ4_COPYPACKET(s, d)	LZ4_COPYSTEP(s, d)
+
+#define LZ4_SECURECOPY(s, d, e)			\
+	do {					\
+		if (d < e) {			\
+			LZ4_WILDCOPY(s, d, e);	\
+		}				\
+	} while (0)
+
+#else	/* 32-bit */
+#define STEPSIZE 4
+
+#define LZ4_COPYSTEP(s, d)	\
+	do {			\
+		PUT4(s, d);	\
+		d += 4;		\
+		s += 4;		\
+	} while (0)
+
+#define LZ4_COPYPACKET(s, d)		\
+	do {				\
+		LZ4_COPYSTEP(s, d);	\
+		LZ4_COPYSTEP(s, d);	\
+	} while (0)
+
+#define LZ4_SECURECOPY	LZ4_WILDCOPY
+#endif
+
+#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \
+	(d = s - get_unaligned_le16(p))
+
+#define LZ4_WILDCOPY(s, d, e)		\
+	do {				\
+		LZ4_COPYPACKET(s, d);	\
+	} while (d < e)
-- 
cgit v0.10.2


From e76e1fdfa8f8dc1ea6699923cf5d92b5bee9c936 Mon Sep 17 00:00:00 2001
From: Kyungsik Lee <kyungsik.lee@lge.com>
Date: Mon, 8 Jul 2013 16:01:46 -0700
Subject: lib: add support for LZ4-compressed kernel

Add support for extracting LZ4-compressed kernel images, as well as
LZ4-compressed ramdisk images in the kernel boot process.

Signed-off-by: Kyungsik Lee <kyungsik.lee@lge.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Florian Fainelli <florian@openwrt.org>
Cc: Yann Collet <yann.collet.73@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/decompress/unlz4.h b/include/linux/decompress/unlz4.h
new file mode 100644
index 0000000..d5b68bf
--- /dev/null
+++ b/include/linux/decompress/unlz4.h
@@ -0,0 +1,10 @@
+#ifndef DECOMPRESS_UNLZ4_H
+#define DECOMPRESS_UNLZ4_H
+
+int unlz4(unsigned char *inbuf, int len,
+	int(*fill)(void*, unsigned int),
+	int(*flush)(void*, unsigned int),
+	unsigned char *output,
+	int *pos,
+	void(*error)(char *x));
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index ea1be00..54d3fa5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -112,10 +112,13 @@ config HAVE_KERNEL_XZ
 config HAVE_KERNEL_LZO
 	bool
 
+config HAVE_KERNEL_LZ4
+	bool
+
 choice
 	prompt "Kernel compression mode"
 	default KERNEL_GZIP
-	depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO
+	depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4
 	help
 	  The linux kernel is a kind of self-extracting executable.
 	  Several compression algorithms are available, which differ
@@ -182,6 +185,18 @@ config KERNEL_LZO
 	  size is about 10% bigger than gzip; however its speed
 	  (both compression and decompression) is the fastest.
 
+config KERNEL_LZ4
+	bool "LZ4"
+	depends on HAVE_KERNEL_LZ4
+	help
+	  LZ4 is an LZ77-type compressor with a fixed, byte-oriented encoding.
+	  A preliminary version of LZ4 de/compression tool is available at
+	  <https://code.google.com/p/lz4/>.
+
+	  Its compression ratio is worse than LZO. The size of the kernel
+	  is about 8% bigger than LZO. But the decompression speed is
+	  faster than LZO.
+
 endchoice
 
 config DEFAULT_HOSTNAME
diff --git a/lib/Kconfig b/lib/Kconfig
index f1ed53c..e9cdc8f 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -194,6 +194,9 @@ config LZO_COMPRESS
 config LZO_DECOMPRESS
 	tristate
 
+config LZ4_DECOMPRESS
+	tristate
+
 source "lib/xz/Kconfig"
 
 #
@@ -218,6 +221,10 @@ config DECOMPRESS_LZO
 	select LZO_DECOMPRESS
 	tristate
 
+config DECOMPRESS_LZ4
+	select LZ4_DECOMPRESS
+	tristate
+
 #
 # Generic allocator support is selected if needed
 #
diff --git a/lib/Makefile b/lib/Makefile
index 9afb4f7..326f6c7 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -75,6 +75,7 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 obj-$(CONFIG_BCH) += bch.o
 obj-$(CONFIG_LZO_COMPRESS) += lzo/
 obj-$(CONFIG_LZO_DECOMPRESS) += lzo/
+obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/
 obj-$(CONFIG_XZ_DEC) += xz/
 obj-$(CONFIG_RAID6_PQ) += raid6/
 
@@ -83,6 +84,7 @@ lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o
 lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o
 lib-$(CONFIG_DECOMPRESS_XZ) += decompress_unxz.o
 lib-$(CONFIG_DECOMPRESS_LZO) += decompress_unlzo.o
+lib-$(CONFIG_DECOMPRESS_LZ4) += decompress_unlz4.o
 
 obj-$(CONFIG_TEXTSEARCH) += textsearch.o
 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
diff --git a/lib/decompress.c b/lib/decompress.c
index f8fdeda..4d1cd03 100644
--- a/lib/decompress.c
+++ b/lib/decompress.c
@@ -11,6 +11,7 @@
 #include <linux/decompress/unxz.h>
 #include <linux/decompress/inflate.h>
 #include <linux/decompress/unlzo.h>
+#include <linux/decompress/unlz4.h>
 
 #include <linux/types.h>
 #include <linux/string.h>
@@ -31,6 +32,9 @@
 #ifndef CONFIG_DECOMPRESS_LZO
 # define unlzo NULL
 #endif
+#ifndef CONFIG_DECOMPRESS_LZ4
+# define unlz4 NULL
+#endif
 
 struct compress_format {
 	unsigned char magic[2];
@@ -45,6 +49,7 @@ static const struct compress_format compressed_formats[] __initconst = {
 	{ {0x5d, 0x00}, "lzma", unlzma },
 	{ {0xfd, 0x37}, "xz", unxz },
 	{ {0x89, 0x4c}, "lzo", unlzo },
+	{ {0x02, 0x21}, "lz4", unlz4 },
 	{ {0, 0}, NULL, NULL }
 };
 
diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c
new file mode 100644
index 0000000..3e67cfa
--- /dev/null
+++ b/lib/decompress_unlz4.c
@@ -0,0 +1,187 @@
+/*
+ * Wrapper for decompressing LZ4-compressed kernel, initramfs, and initrd
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifdef STATIC
+#define PREBOOT
+#include "lz4/lz4_decompress.c"
+#else
+#include <linux/decompress/unlz4.h>
+#endif
+#include <linux/types.h>
+#include <linux/lz4.h>
+#include <linux/decompress/mm.h>
+#include <linux/compiler.h>
+
+#include <asm/unaligned.h>
+
+/*
+ * Note: Uncompressed chunk size is used in the compressor side
+ * (userspace side for compression).
+ * It is hardcoded because there is not proper way to extract it
+ * from the binary stream which is generated by the preliminary
+ * version of LZ4 tool so far.
+ */
+#define LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE (8 << 20)
+#define ARCHIVE_MAGICNUMBER 0x184C2102
+
+STATIC inline int INIT unlz4(u8 *input, int in_len,
+				int (*fill) (void *, unsigned int),
+				int (*flush) (void *, unsigned int),
+				u8 *output, int *posp,
+				void (*error) (char *x))
+{
+	int ret = -1;
+	size_t chunksize = 0;
+	size_t uncomp_chunksize = LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE;
+	u8 *inp;
+	u8 *inp_start;
+	u8 *outp;
+	int size = in_len;
+#ifdef PREBOOT
+	size_t out_len = get_unaligned_le32(input + in_len);
+#endif
+	size_t dest_len;
+
+
+	if (output) {
+		outp = output;
+	} else if (!flush) {
+		error("NULL output pointer and no flush function provided");
+		goto exit_0;
+	} else {
+		outp = large_malloc(uncomp_chunksize);
+		if (!outp) {
+			error("Could not allocate output buffer");
+			goto exit_0;
+		}
+	}
+
+	if (input && fill) {
+		error("Both input pointer and fill function provided,");
+		goto exit_1;
+	} else if (input) {
+		inp = input;
+	} else if (!fill) {
+		error("NULL input pointer and missing fill function");
+		goto exit_1;
+	} else {
+		inp = large_malloc(lz4_compressbound(uncomp_chunksize));
+		if (!inp) {
+			error("Could not allocate input buffer");
+			goto exit_1;
+		}
+	}
+	inp_start = inp;
+
+	if (posp)
+		*posp = 0;
+
+	if (fill)
+		fill(inp, 4);
+
+	chunksize = get_unaligned_le32(inp);
+	if (chunksize == ARCHIVE_MAGICNUMBER) {
+		inp += 4;
+		size -= 4;
+	} else {
+		error("invalid header");
+		goto exit_2;
+	}
+
+	if (posp)
+		*posp += 4;
+
+	for (;;) {
+
+		if (fill)
+			fill(inp, 4);
+
+		chunksize = get_unaligned_le32(inp);
+		if (chunksize == ARCHIVE_MAGICNUMBER) {
+			inp += 4;
+			size -= 4;
+			if (posp)
+				*posp += 4;
+			continue;
+		}
+		inp += 4;
+		size -= 4;
+
+		if (posp)
+			*posp += 4;
+
+		if (fill) {
+			if (chunksize > lz4_compressbound(uncomp_chunksize)) {
+				error("chunk length is longer than allocated");
+				goto exit_2;
+			}
+			fill(inp, chunksize);
+		}
+#ifdef PREBOOT
+		if (out_len >= uncomp_chunksize) {
+			dest_len = uncomp_chunksize;
+			out_len -= dest_len;
+		} else
+			dest_len = out_len;
+		ret = lz4_decompress(inp, &chunksize, outp, dest_len);
+#else
+		dest_len = uncomp_chunksize;
+		ret = lz4_decompress_unknownoutputsize(inp, chunksize, outp,
+				&dest_len);
+#endif
+		if (ret < 0) {
+			error("Decoding failed");
+			goto exit_2;
+		}
+
+		if (flush && flush(outp, dest_len) != dest_len)
+			goto exit_2;
+		if (output)
+			outp += dest_len;
+		if (posp)
+			*posp += chunksize;
+
+		size -= chunksize;
+
+		if (size == 0)
+			break;
+		else if (size < 0) {
+			error("data corrupted");
+			goto exit_2;
+		}
+
+		inp += chunksize;
+		if (fill)
+			inp = inp_start;
+	}
+
+	ret = 0;
+exit_2:
+	if (!input)
+		large_free(inp_start);
+exit_1:
+	if (!output)
+		large_free(outp);
+exit_0:
+	return ret;
+}
+
+#ifdef PREBOOT
+STATIC int INIT decompress(unsigned char *buf, int in_len,
+			      int(*fill)(void*, unsigned int),
+			      int(*flush)(void*, unsigned int),
+			      unsigned char *output,
+			      int *posp,
+			      void(*error)(char *x)
+	)
+{
+	return unlz4(buf, in_len - 4, fill, flush, output, posp, error);
+}
+#endif
diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile
new file mode 100644
index 0000000..7f548c6
--- /dev/null
+++ b/lib/lz4/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index dcc8975..d3414ea 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -1,7 +1,7 @@
 /*
  * LZ4 Decompressor for Linux kernel
  *
- * Copyright (C) 2013 LG Electronics Co., Ltd. (http://www.lge.com/)
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
  *
  * Based on LZ4 implementation by Yann Collet.
  *
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index f97869f..6031e23 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -311,6 +311,11 @@ cmd_lzo = (cat $(filter-out FORCE,$^) | \
 	lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
 	(rm -f $@ ; false)
 
+quiet_cmd_lz4 = LZ4     $@
+cmd_lz4 = (cat $(filter-out FORCE,$^) | \
+	lz4c -l -c1 stdin stdout && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+	(rm -f $@ ; false)
+
 # U-Boot mkimage
 # ---------------------------------------------------------------------------
 
diff --git a/usr/Kconfig b/usr/Kconfig
index 085872b..642f503 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -90,6 +90,15 @@ config RD_LZO
 	  Support loading of a LZO encoded initial ramdisk or cpio buffer
 	  If unsure, say N.
 
+config RD_LZ4
+	bool "Support initial ramdisks compressed using LZ4" if EXPERT
+	default !EXPERT
+	depends on BLK_DEV_INITRD
+	select DECOMPRESS_LZ4
+	help
+	  Support loading of a LZ4 encoded initial ramdisk or cpio buffer
+	  If unsure, say N.
+
 choice
 	prompt "Built-in initramfs compression mode" if INITRAMFS_SOURCE!=""
 	help
-- 
cgit v0.10.2


From f9b493ac9b833fd9dd3bbd50460adb33f29e1238 Mon Sep 17 00:00:00 2001
From: Kyungsik Lee <kyungsik.lee@lge.com>
Date: Mon, 8 Jul 2013 16:01:48 -0700
Subject: arm: add support for LZ4-compressed kernel

Integrates the LZ4 decompression code to the arm pre-boot code.

Signed-off-by: Kyungsik Lee <kyungsik.lee@lge.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Florian Fainelli <florian@openwrt.org>
Cc: Yann Collet <yann.collet.73@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index 3840b6f..fc66d42 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -657,9 +657,10 @@ Protocol:	2.08+
   uncompressed data should be determined using the standard magic
   numbers.  The currently supported compression formats are gzip
   (magic numbers 1F 8B or 1F 9E), bzip2 (magic number 42 5A), LZMA
-  (magic number 5D 00), and XZ (magic number FD 37).  The uncompressed
-  payload is currently always ELF (magic number 7F 45 4C 46).
-  
+  (magic number 5D 00), XZ (magic number FD 37), and LZ4 (magic number
+  02 21).  The uncompressed payload is currently always ELF (magic
+  number 7F 45 4C 46).
+
 Field name:	payload_length
 Type:		read
 Offset/size:	0x24c/4
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 5ef7af0..0ac9be6 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -41,6 +41,7 @@ config ARM
 	select HAVE_IDE if PCI || ISA || PCMCIA
 	select HAVE_IRQ_TIME_ACCOUNTING
 	select HAVE_KERNEL_GZIP
+	select HAVE_KERNEL_LZ4
 	select HAVE_KERNEL_LZMA
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_XZ
diff --git a/arch/arm/boot/compressed/.gitignore b/arch/arm/boot/compressed/.gitignore
index f79a08e..47279aa 100644
--- a/arch/arm/boot/compressed/.gitignore
+++ b/arch/arm/boot/compressed/.gitignore
@@ -6,6 +6,7 @@ piggy.gzip
 piggy.lzo
 piggy.lzma
 piggy.xzkern
+piggy.lz4
 vmlinux
 vmlinux.lds
 
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index 48d0a44..7ac1610 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -91,6 +91,7 @@ suffix_$(CONFIG_KERNEL_GZIP) = gzip
 suffix_$(CONFIG_KERNEL_LZO)  = lzo
 suffix_$(CONFIG_KERNEL_LZMA) = lzma
 suffix_$(CONFIG_KERNEL_XZ)   = xzkern
+suffix_$(CONFIG_KERNEL_LZ4)  = lz4
 
 # Borrowed libfdt files for the ATAG compatibility mode
 
@@ -115,7 +116,7 @@ targets       := vmlinux vmlinux.lds \
 		 font.o font.c head.o misc.o $(OBJS)
 
 # Make sure files are removed during clean
-extra-y       += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern \
+extra-y       += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern piggy.lz4 \
 		 lib1funcs.S ashldi3.S $(libfdt) $(libfdt_hdrs) \
 		 hyp-stub.S
 
diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c
index 24b0475..bd245d3 100644
--- a/arch/arm/boot/compressed/decompress.c
+++ b/arch/arm/boot/compressed/decompress.c
@@ -51,6 +51,10 @@ extern char * strstr(const char * s1, const char *s2);
 #include "../../../../lib/decompress_unxz.c"
 #endif
 
+#ifdef CONFIG_KERNEL_LZ4
+#include "../../../../lib/decompress_unlz4.c"
+#endif
+
 int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x))
 {
 	return decompress(input, len, NULL, NULL, output, NULL, error);
diff --git a/arch/arm/boot/compressed/piggy.lz4.S b/arch/arm/boot/compressed/piggy.lz4.S
new file mode 100644
index 0000000..3d9a575
--- /dev/null
+++ b/arch/arm/boot/compressed/piggy.lz4.S
@@ -0,0 +1,6 @@
+	.section .piggydata,#alloc
+	.globl	input_data
+input_data:
+	.incbin	"arch/arm/boot/compressed/piggy.lz4"
+	.globl	input_data_end
+input_data_end:
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 265c672..b32ebf9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -65,6 +65,7 @@ config X86
 	select HAVE_KERNEL_LZMA
 	select HAVE_KERNEL_XZ
 	select HAVE_KERNEL_LZO
+	select HAVE_KERNEL_LZ4
 	select HAVE_HW_BREAKPOINT
 	select HAVE_MIXED_BREAKPOINTS_REGS
 	select PERF_EVENTS
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 5ef205c..dcd90df 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,8 @@
 # create a compressed vmlinux image from the original vmlinux
 #
 
-targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo
+targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
+	vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
 
 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -63,12 +64,15 @@ $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,xzkern)
 $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,lzo)
+$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
+	$(call if_changed,lz4)
 
 suffix-$(CONFIG_KERNEL_GZIP)	:= gz
 suffix-$(CONFIG_KERNEL_BZIP2)	:= bz2
 suffix-$(CONFIG_KERNEL_LZMA)	:= lzma
 suffix-$(CONFIG_KERNEL_XZ)	:= xz
 suffix-$(CONFIG_KERNEL_LZO) 	:= lzo
+suffix-$(CONFIG_KERNEL_LZ4) 	:= lz4
 
 quiet_cmd_mkpiggy = MKPIGGY $@
       cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 7cb56c6..0319c88 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -145,6 +145,10 @@ static int lines, cols;
 #include "../../../../lib/decompress_unlzo.c"
 #endif
 
+#ifdef CONFIG_KERNEL_LZ4
+#include "../../../../lib/decompress_unlz4.c"
+#endif
+
 static void scroll(void)
 {
 	int i;
-- 
cgit v0.10.2


From c72ac7a1a926dbffb59daf0f275450e5eecce16f Mon Sep 17 00:00:00 2001
From: Chanho Min <chanho.min@lge.com>
Date: Mon, 8 Jul 2013 16:01:49 -0700
Subject: lib: add lz4 compressor module

This patchset is for supporting LZ4 compression and the crypto API using
it.

As shown below, the size of data is a little bit bigger but compressing
speed is faster under the enabled unaligned memory access.  We can use
lz4 de/compression through crypto API as well.  Also, It will be useful
for another potential user of lz4 compression.

lz4 Compression Benchmark:
Compiler: ARM gcc 4.6.4
ARMv7, 1 GHz based board
   Kernel: linux 3.4
   Uncompressed data Size: 101 MB
         Compressed Size  compression Speed
   LZO   72.1MB		  32.1MB/s, 33.0MB/s(UA)
   LZ4   75.1MB		  30.4MB/s, 35.9MB/s(UA)
   LZ4HC 59.8MB		   2.4MB/s,  2.5MB/s(UA)
- UA: Unaligned memory Access support
- Latest patch set for LZO applied

This patch:

Add support for LZ4 compression in the Linux Kernel.  LZ4 Compression APIs
for kernel are based on LZ4 implementation by Yann Collet and were changed
for kernel coding style.

LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
LZ4 source repository : http://code.google.com/p/lz4/
svn revision : r90

Two APIs are added:

lz4_compress() support basic lz4 compression whereas lz4hc_compress()
support high compression or CPU performance get lower but compression
ratio get higher.  Also, we require the pre-allocated working memory with
the defined size and destination buffer must be allocated with the size of
lz4_compressbound.

[akpm@linux-foundation.org: make lz4_compresshcctx() static]
Signed-off-by: Chanho Min <chanho.min@lge.com>
Cc: "Darrick J. Wong" <djwong@us.ibm.com>
Cc: Bob Pearson <rpearson@systemfabricworks.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Herbert Xu <herbert@gondor.hengli.com.au>
Cc: Yann Collet <yann.collet.73@gmail.com>
Cc: Kyungsik Lee <kyungsik.lee@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/lz4.h b/include/linux/lz4.h
index 7f6c75a..d21c13f 100644
--- a/include/linux/lz4.h
+++ b/include/linux/lz4.h
@@ -9,6 +9,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#define LZ4_MEM_COMPRESS	(4096 * sizeof(unsigned char *))
+#define LZ4HC_MEM_COMPRESS	(65538 * sizeof(unsigned char *))
 
 /*
  * lz4_compressbound()
@@ -21,6 +23,40 @@ static inline size_t lz4_compressbound(size_t isize)
 }
 
 /*
+ * lz4_compress()
+ *	src     : source address of the original data
+ *	src_len : size of the original data
+ *	dst	: output buffer address of the compressed data
+ *		This requires 'dst' of size LZ4_COMPRESSBOUND.
+ *	dst_len : is the output size, which is returned after compress done
+ *	workmem : address of the working memory.
+ *		This requires 'workmem' of size LZ4_MEM_COMPRESS.
+ *	return  : Success if return 0
+ *		  Error if return (< 0)
+ *	note :  Destination buffer and workmem must be already allocated with
+ *		the defined size.
+ */
+int lz4_compress(const unsigned char *src, size_t src_len,
+		unsigned char *dst, size_t *dst_len, void *wrkmem);
+
+ /*
+  * lz4hc_compress()
+  *	 src	 : source address of the original data
+  *	 src_len : size of the original data
+  *	 dst	 : output buffer address of the compressed data
+  *		This requires 'dst' of size LZ4_COMPRESSBOUND.
+  *	 dst_len : is the output size, which is returned after compress done
+  *	 workmem : address of the working memory.
+  *		This requires 'workmem' of size LZ4HC_MEM_COMPRESS.
+  *	 return  : Success if return 0
+  *		   Error if return (< 0)
+  *	 note :  Destination buffer and workmem must be already allocated with
+  *		 the defined size.
+  */
+int lz4hc_compress(const unsigned char *src, size_t src_len,
+		unsigned char *dst, size_t *dst_len, void *wrkmem);
+
+/*
  * lz4_decompress()
  *	src     : source address of the compressed data
  *	src_len : is the input size, whcih is returned after decompress done
diff --git a/lib/Kconfig b/lib/Kconfig
index e9cdc8f..35da513 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -194,6 +194,12 @@ config LZO_COMPRESS
 config LZO_DECOMPRESS
 	tristate
 
+config LZ4_COMPRESS
+	tristate
+
+config LZ4HC_COMPRESS
+	tristate
+
 config LZ4_DECOMPRESS
 	tristate
 
diff --git a/lib/Makefile b/lib/Makefile
index 326f6c7..7baccfd 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -75,6 +75,8 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 obj-$(CONFIG_BCH) += bch.o
 obj-$(CONFIG_LZO_COMPRESS) += lzo/
 obj-$(CONFIG_LZO_DECOMPRESS) += lzo/
+obj-$(CONFIG_LZ4_COMPRESS) += lz4/
+obj-$(CONFIG_LZ4HC_COMPRESS) += lz4/
 obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/
 obj-$(CONFIG_XZ_DEC) += xz/
 obj-$(CONFIG_RAID6_PQ) += raid6/
diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile
index 7f548c6..8085d04 100644
--- a/lib/lz4/Makefile
+++ b/lib/lz4/Makefile
@@ -1 +1,3 @@
+obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o
+obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o
 obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o
diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
new file mode 100644
index 0000000..fd94058
--- /dev/null
+++ b/lib/lz4/lz4_compress.c
@@ -0,0 +1,443 @@
+/*
+ * LZ4 - Fast LZ compression algorithm
+ * Copyright (C) 2011-2012, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ *
+ *  Changed for kernel use by:
+ *  Chanho Min <chanho.min@lge.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/lz4.h>
+#include <asm/unaligned.h>
+#include "lz4defs.h"
+
+/*
+ * LZ4_compressCtx :
+ * -----------------
+ * Compress 'isize' bytes from 'source' into an output buffer 'dest' of
+ * maximum size 'maxOutputSize'.  * If it cannot achieve it, compression
+ * will stop, and result of the function will be zero.
+ * return : the number of bytes written in buffer 'dest', or 0 if the
+ * compression fails
+ */
+static inline int lz4_compressctx(void *ctx,
+		const char *source,
+		char *dest,
+		int isize,
+		int maxoutputsize)
+{
+	HTYPE *hashtable = (HTYPE *)ctx;
+	const u8 *ip = (u8 *)source;
+#if LZ4_ARCH64
+	const BYTE * const base = ip;
+#else
+	const int base = 0;
+#endif
+	const u8 *anchor = ip;
+	const u8 *const iend = ip + isize;
+	const u8 *const mflimit = iend - MFLIMIT;
+	#define MATCHLIMIT (iend - LASTLITERALS)
+
+	u8 *op = (u8 *) dest;
+	u8 *const oend = op + maxoutputsize;
+	int length;
+	const int skipstrength = SKIPSTRENGTH;
+	u32 forwardh;
+	int lastrun;
+
+	/* Init */
+	if (isize < MINLENGTH)
+		goto _last_literals;
+
+	memset((void *)hashtable, 0, LZ4_MEM_COMPRESS);
+
+	/* First Byte */
+	hashtable[LZ4_HASH_VALUE(ip)] = ip - base;
+	ip++;
+	forwardh = LZ4_HASH_VALUE(ip);
+
+	/* Main Loop */
+	for (;;) {
+		int findmatchattempts = (1U << skipstrength) + 3;
+		const u8 *forwardip = ip;
+		const u8 *ref;
+		u8 *token;
+
+		/* Find a match */
+		do {
+			u32 h = forwardh;
+			int step = findmatchattempts++ >> skipstrength;
+			ip = forwardip;
+			forwardip = ip + step;
+
+			if (unlikely(forwardip > mflimit))
+				goto _last_literals;
+
+			forwardh = LZ4_HASH_VALUE(forwardip);
+			ref = base + hashtable[h];
+			hashtable[h] = ip - base;
+		} while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip)));
+
+		/* Catch up */
+		while ((ip > anchor) && (ref > (u8 *)source) &&
+			unlikely(ip[-1] == ref[-1])) {
+			ip--;
+			ref--;
+		}
+
+		/* Encode Literal length */
+		length = (int)(ip - anchor);
+		token = op++;
+		/* check output limit */
+		if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
+			(length >> 8) > oend))
+			return 0;
+
+		if (length >= (int)RUN_MASK) {
+			int len;
+			*token = (RUN_MASK << ML_BITS);
+			len = length - RUN_MASK;
+			for (; len > 254 ; len -= 255)
+				*op++ = 255;
+			*op++ = (u8)len;
+		} else
+			*token = (length << ML_BITS);
+
+		/* Copy Literals */
+		LZ4_BLINDCOPY(anchor, op, length);
+_next_match:
+		/* Encode Offset */
+		LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref));
+
+		/* Start Counting */
+		ip += MINMATCH;
+		/* MinMatch verified */
+		ref += MINMATCH;
+		anchor = ip;
+		while (likely(ip < MATCHLIMIT - (STEPSIZE - 1))) {
+			#if LZ4_ARCH64
+			u64 diff = A64(ref) ^ A64(ip);
+			#else
+			u32 diff = A32(ref) ^ A32(ip);
+			#endif
+			if (!diff) {
+				ip += STEPSIZE;
+				ref += STEPSIZE;
+				continue;
+			}
+			ip += LZ4_NBCOMMONBYTES(diff);
+			goto _endcount;
+		}
+		#if LZ4_ARCH64
+		if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) {
+			ip += 4;
+			ref += 4;
+		}
+		#endif
+		if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) {
+			ip += 2;
+			ref += 2;
+		}
+		if ((ip < MATCHLIMIT) && (*ref == *ip))
+			ip++;
+_endcount:
+		/* Encode MatchLength */
+		length = (int)(ip - anchor);
+		/* Check output limit */
+		if (unlikely(op + (1 + LASTLITERALS) + (length >> 8) > oend))
+			return 0;
+		if (length >= (int)ML_MASK) {
+			*token += ML_MASK;
+			length -= ML_MASK;
+			for (; length > 509 ; length -= 510) {
+				*op++ = 255;
+				*op++ = 255;
+			}
+			if (length > 254) {
+				length -= 255;
+				*op++ = 255;
+			}
+			*op++ = (u8)length;
+		} else
+			*token += length;
+
+		/* Test end of chunk */
+		if (ip > mflimit) {
+			anchor = ip;
+			break;
+		}
+
+		/* Fill table */
+		hashtable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base;
+
+		/* Test next position */
+		ref = base + hashtable[LZ4_HASH_VALUE(ip)];
+		hashtable[LZ4_HASH_VALUE(ip)] = ip - base;
+		if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) {
+			token = op++;
+			*token = 0;
+			goto _next_match;
+		}
+
+		/* Prepare next loop */
+		anchor = ip++;
+		forwardh = LZ4_HASH_VALUE(ip);
+	}
+
+_last_literals:
+	/* Encode Last Literals */
+	lastrun = (int)(iend - anchor);
+	if (((char *)op - dest) + lastrun + 1
+		+ ((lastrun + 255 - RUN_MASK) / 255) > (u32)maxoutputsize)
+		return 0;
+
+	if (lastrun >= (int)RUN_MASK) {
+		*op++ = (RUN_MASK << ML_BITS);
+		lastrun -= RUN_MASK;
+		for (; lastrun > 254 ; lastrun -= 255)
+			*op++ = 255;
+		*op++ = (u8)lastrun;
+	} else
+		*op++ = (lastrun << ML_BITS);
+	memcpy(op, anchor, iend - anchor);
+	op += iend - anchor;
+
+	/* End */
+	return (int)(((char *)op) - dest);
+}
+
+static inline int lz4_compress64kctx(void *ctx,
+		const char *source,
+		char *dest,
+		int isize,
+		int maxoutputsize)
+{
+	u16 *hashtable = (u16 *)ctx;
+	const u8 *ip = (u8 *) source;
+	const u8 *anchor = ip;
+	const u8 *const base = ip;
+	const u8 *const iend = ip + isize;
+	const u8 *const mflimit = iend - MFLIMIT;
+	#define MATCHLIMIT (iend - LASTLITERALS)
+
+	u8 *op = (u8 *) dest;
+	u8 *const oend = op + maxoutputsize;
+	int len, length;
+	const int skipstrength = SKIPSTRENGTH;
+	u32 forwardh;
+	int lastrun;
+
+	/* Init */
+	if (isize < MINLENGTH)
+		goto _last_literals;
+
+	memset((void *)hashtable, 0, LZ4_MEM_COMPRESS);
+
+	/* First Byte */
+	ip++;
+	forwardh = LZ4_HASH64K_VALUE(ip);
+
+	/* Main Loop */
+	for (;;) {
+		int findmatchattempts = (1U << skipstrength) + 3;
+		const u8 *forwardip = ip;
+		const u8 *ref;
+		u8 *token;
+
+		/* Find a match */
+		do {
+			u32 h = forwardh;
+			int step = findmatchattempts++ >> skipstrength;
+			ip = forwardip;
+			forwardip = ip + step;
+
+			if (forwardip > mflimit)
+				goto _last_literals;
+
+			forwardh = LZ4_HASH64K_VALUE(forwardip);
+			ref = base + hashtable[h];
+			hashtable[h] = (u16)(ip - base);
+		} while (A32(ref) != A32(ip));
+
+		/* Catch up */
+		while ((ip > anchor) && (ref > (u8 *)source)
+			&& (ip[-1] == ref[-1])) {
+			ip--;
+			ref--;
+		}
+
+		/* Encode Literal length */
+		length = (int)(ip - anchor);
+		token = op++;
+		/* Check output limit */
+		if (unlikely(op + length + (2 + 1 + LASTLITERALS)
+			+ (length >> 8) > oend))
+			return 0;
+		if (length >= (int)RUN_MASK) {
+			*token = (RUN_MASK << ML_BITS);
+			len = length - RUN_MASK;
+			for (; len > 254 ; len -= 255)
+				*op++ = 255;
+			*op++ = (u8)len;
+		} else
+			*token = (length << ML_BITS);
+
+		/* Copy Literals */
+		LZ4_BLINDCOPY(anchor, op, length);
+
+_next_match:
+		/* Encode Offset */
+		LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref));
+
+		/* Start Counting */
+		ip += MINMATCH;
+		/* MinMatch verified */
+		ref += MINMATCH;
+		anchor = ip;
+
+		while (ip < MATCHLIMIT - (STEPSIZE - 1)) {
+			#if LZ4_ARCH64
+			u64 diff = A64(ref) ^ A64(ip);
+			#else
+			u32 diff = A32(ref) ^ A32(ip);
+			#endif
+
+			if (!diff) {
+				ip += STEPSIZE;
+				ref += STEPSIZE;
+				continue;
+			}
+			ip += LZ4_NBCOMMONBYTES(diff);
+			goto _endcount;
+		}
+		#if LZ4_ARCH64
+		if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) {
+			ip += 4;
+			ref += 4;
+		}
+		#endif
+		if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) {
+			ip += 2;
+			ref += 2;
+		}
+		if ((ip < MATCHLIMIT) && (*ref == *ip))
+			ip++;
+_endcount:
+
+		/* Encode MatchLength */
+		len = (int)(ip - anchor);
+		/* Check output limit */
+		if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend))
+			return 0;
+		if (len >= (int)ML_MASK) {
+			*token += ML_MASK;
+			len -= ML_MASK;
+			for (; len > 509 ; len -= 510) {
+				*op++ = 255;
+				*op++ = 255;
+			}
+			if (len > 254) {
+				len -= 255;
+				*op++ = 255;
+			}
+			*op++ = (u8)len;
+		} else
+			*token += len;
+
+		/* Test end of chunk */
+		if (ip > mflimit) {
+			anchor = ip;
+			break;
+		}
+
+		/* Fill table */
+		hashtable[LZ4_HASH64K_VALUE(ip-2)] = (u16)(ip - 2 - base);
+
+		/* Test next position */
+		ref = base + hashtable[LZ4_HASH64K_VALUE(ip)];
+		hashtable[LZ4_HASH64K_VALUE(ip)] = (u16)(ip - base);
+		if (A32(ref) == A32(ip)) {
+			token = op++;
+			*token = 0;
+			goto _next_match;
+		}
+
+		/* Prepare next loop */
+		anchor = ip++;
+		forwardh = LZ4_HASH64K_VALUE(ip);
+	}
+
+_last_literals:
+	/* Encode Last Literals */
+	lastrun = (int)(iend - anchor);
+	if (op + lastrun + 1 + (lastrun - RUN_MASK + 255) / 255 > oend)
+		return 0;
+	if (lastrun >= (int)RUN_MASK) {
+		*op++ = (RUN_MASK << ML_BITS);
+		lastrun -= RUN_MASK;
+		for (; lastrun > 254 ; lastrun -= 255)
+			*op++ = 255;
+		*op++ = (u8)lastrun;
+	} else
+		*op++ = (lastrun << ML_BITS);
+	memcpy(op, anchor, iend - anchor);
+	op += iend - anchor;
+	/* End */
+	return (int)(((char *)op) - dest);
+}
+
+int lz4_compress(const unsigned char *src, size_t src_len,
+			unsigned char *dst, size_t *dst_len, void *wrkmem)
+{
+	int ret = -1;
+	int out_len = 0;
+
+	if (src_len < LZ4_64KLIMIT)
+		out_len = lz4_compress64kctx(wrkmem, src, dst, src_len,
+				lz4_compressbound(src_len));
+	else
+		out_len = lz4_compressctx(wrkmem, src, dst, src_len,
+				lz4_compressbound(src_len));
+
+	if (out_len < 0)
+		goto exit;
+
+	*dst_len = out_len;
+
+	return 0;
+exit:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lz4_compress);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZ4 compressor");
diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
index 43ac31d..abcecdc 100644
--- a/lib/lz4/lz4defs.h
+++ b/lib/lz4/lz4defs.h
@@ -22,23 +22,40 @@
  * Architecture-specific macros
  */
 #define BYTE	u8
+typedef struct _U16_S { u16 v; } U16_S;
+typedef struct _U32_S { u32 v; } U32_S;
+typedef struct _U64_S { u64 v; } U64_S;
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)		\
 	|| defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6	\
 	&& defined(ARM_EFFICIENT_UNALIGNED_ACCESS)
-typedef struct _U32_S { u32 v; } U32_S;
-typedef struct _U64_S { u64 v; } U64_S;
 
+#define A16(x) (((U16_S *)(x))->v)
 #define A32(x) (((U32_S *)(x))->v)
 #define A64(x) (((U64_S *)(x))->v)
 
 #define PUT4(s, d) (A32(d) = A32(s))
 #define PUT8(s, d) (A64(d) = A64(s))
+#define LZ4_WRITE_LITTLEENDIAN_16(p, v)	\
+	do {	\
+		A16(p) = v; \
+		p += 2; \
+	} while (0)
 #else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */
 
+#define A64(x) get_unaligned((u64 *)&(((U16_S *)(x))->v))
+#define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v))
+#define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v))
+
 #define PUT4(s, d) \
 	put_unaligned(get_unaligned((const u32 *) s), (u32 *) d)
 #define PUT8(s, d) \
 	put_unaligned(get_unaligned((const u64 *) s), (u64 *) d)
+
+#define LZ4_WRITE_LITTLEENDIAN_16(p, v)	\
+	do {	\
+		put_unaligned(v, (u16 *)(p)); \
+		p += 2; \
+	} while (0)
 #endif
 
 #define COPYLENGTH 8
@@ -46,6 +63,29 @@ typedef struct _U64_S { u64 v; } U64_S;
 #define ML_MASK  ((1U << ML_BITS) - 1)
 #define RUN_BITS (8 - ML_BITS)
 #define RUN_MASK ((1U << RUN_BITS) - 1)
+#define MEMORY_USAGE	14
+#define MINMATCH	4
+#define SKIPSTRENGTH	6
+#define LASTLITERALS	5
+#define MFLIMIT		(COPYLENGTH + MINMATCH)
+#define MINLENGTH	(MFLIMIT + 1)
+#define MAXD_LOG	16
+#define MAXD		(1 << MAXD_LOG)
+#define MAXD_MASK	(u32)(MAXD - 1)
+#define MAX_DISTANCE	(MAXD - 1)
+#define HASH_LOG	(MAXD_LOG - 1)
+#define HASHTABLESIZE	(1 << HASH_LOG)
+#define MAX_NB_ATTEMPTS	256
+#define OPTIMAL_ML	(int)((ML_MASK-1)+MINMATCH)
+#define LZ4_64KLIMIT	((1<<16) + (MFLIMIT - 1))
+#define HASHLOG64K	((MEMORY_USAGE - 2) + 1)
+#define HASH64KTABLESIZE	(1U << HASHLOG64K)
+#define LZ4_HASH_VALUE(p)	(((A32(p)) * 2654435761U) >> \
+				((MINMATCH * 8) - (MEMORY_USAGE-2)))
+#define LZ4_HASH64K_VALUE(p)	(((A32(p)) * 2654435761U) >> \
+				((MINMATCH * 8) - HASHLOG64K))
+#define HASH_VALUE(p)		(((A32(p)) * 2654435761U) >> \
+				((MINMATCH * 8) - HASH_LOG))
 
 #if LZ4_ARCH64/* 64-bit */
 #define STEPSIZE 8
@@ -65,6 +105,13 @@ typedef struct _U64_S { u64 v; } U64_S;
 			LZ4_WILDCOPY(s, d, e);	\
 		}				\
 	} while (0)
+#define HTYPE u32
+
+#ifdef __BIG_ENDIAN
+#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3)
+#else
+#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3)
+#endif
 
 #else	/* 32-bit */
 #define STEPSIZE 4
@@ -83,6 +130,14 @@ typedef struct _U64_S { u64 v; } U64_S;
 	} while (0)
 
 #define LZ4_SECURECOPY	LZ4_WILDCOPY
+#define HTYPE const u8*
+
+#ifdef __BIG_ENDIAN
+#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3)
+#else
+#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3)
+#endif
+
 #endif
 
 #define LZ4_READ_LITTLEENDIAN_16(d, s, p) \
@@ -92,3 +147,10 @@ typedef struct _U64_S { u64 v; } U64_S;
 	do {				\
 		LZ4_COPYPACKET(s, d);	\
 	} while (d < e)
+
+#define LZ4_BLINDCOPY(s, d, l)	\
+	do {	\
+		u8 *e = (d) + l;	\
+		LZ4_WILDCOPY(s, d, e);	\
+		d = e;	\
+	} while (0)
diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
new file mode 100644
index 0000000..eb1a74f
--- /dev/null
+++ b/lib/lz4/lz4hc_compress.c
@@ -0,0 +1,539 @@
+/*
+ * LZ4 HC - High Compression Mode of LZ4
+ * Copyright (C) 2011-2012, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ *
+ *  Changed for kernel use by:
+ *  Chanho Min <chanho.min@lge.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/lz4.h>
+#include <asm/unaligned.h>
+#include "lz4defs.h"
+
+struct lz4hc_data {
+	const u8 *base;
+	HTYPE hashtable[HASHTABLESIZE];
+	u16 chaintable[MAXD];
+	const u8 *nexttoupdate;
+} __attribute__((__packed__));
+
+static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base)
+{
+	memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable));
+	memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable));
+
+#if LZ4_ARCH64
+	hc4->nexttoupdate = base + 1;
+#else
+	hc4->nexttoupdate = base;
+#endif
+	hc4->base = base;
+	return 1;
+}
+
+/* Update chains up to ip (excluded) */
+static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip)
+{
+	u16 *chaintable = hc4->chaintable;
+	HTYPE *hashtable  = hc4->hashtable;
+#if LZ4_ARCH64
+	const BYTE * const base = hc4->base;
+#else
+	const int base = 0;
+#endif
+
+	while (hc4->nexttoupdate < ip) {
+		const u8 *p = hc4->nexttoupdate;
+		size_t delta = p - (hashtable[HASH_VALUE(p)] + base);
+		if (delta > MAX_DISTANCE)
+			delta = MAX_DISTANCE;
+		chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta;
+		hashtable[HASH_VALUE(p)] = (p) - base;
+		hc4->nexttoupdate++;
+	}
+}
+
+static inline size_t lz4hc_commonlength(const u8 *p1, const u8 *p2,
+		const u8 *const matchlimit)
+{
+	const u8 *p1t = p1;
+
+	while (p1t < matchlimit - (STEPSIZE - 1)) {
+#if LZ4_ARCH64
+		u64 diff = A64(p2) ^ A64(p1t);
+#else
+		u32 diff = A32(p2) ^ A32(p1t);
+#endif
+		if (!diff) {
+			p1t += STEPSIZE;
+			p2 += STEPSIZE;
+			continue;
+		}
+		p1t += LZ4_NBCOMMONBYTES(diff);
+		return p1t - p1;
+	}
+#if LZ4_ARCH64
+	if ((p1t < (matchlimit-3)) && (A32(p2) == A32(p1t))) {
+		p1t += 4;
+		p2 += 4;
+	}
+#endif
+
+	if ((p1t < (matchlimit - 1)) && (A16(p2) == A16(p1t))) {
+		p1t += 2;
+		p2 += 2;
+	}
+	if ((p1t < matchlimit) && (*p2 == *p1t))
+		p1t++;
+	return p1t - p1;
+}
+
+static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4,
+		const u8 *ip, const u8 *const matchlimit, const u8 **matchpos)
+{
+	u16 *const chaintable = hc4->chaintable;
+	HTYPE *const hashtable = hc4->hashtable;
+	const u8 *ref;
+#if LZ4_ARCH64
+	const BYTE * const base = hc4->base;
+#else
+	const int base = 0;
+#endif
+	int nbattempts = MAX_NB_ATTEMPTS;
+	size_t repl = 0, ml = 0;
+	u16 delta;
+
+	/* HC4 match finder */
+	lz4hc_insert(hc4, ip);
+	ref = hashtable[HASH_VALUE(ip)] + base;
+
+	/* potential repetition */
+	if (ref >= ip-4) {
+		/* confirmed */
+		if (A32(ref) == A32(ip)) {
+			delta = (u16)(ip-ref);
+			repl = ml  = lz4hc_commonlength(ip + MINMATCH,
+					ref + MINMATCH, matchlimit) + MINMATCH;
+			*matchpos = ref;
+		}
+		ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+	}
+
+	while ((ref >= ip - MAX_DISTANCE) && nbattempts) {
+		nbattempts--;
+		if (*(ref + ml) == *(ip + ml)) {
+			if (A32(ref) == A32(ip)) {
+				size_t mlt =
+					lz4hc_commonlength(ip + MINMATCH,
+					ref + MINMATCH, matchlimit) + MINMATCH;
+				if (mlt > ml) {
+					ml = mlt;
+					*matchpos = ref;
+				}
+			}
+		}
+		ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+	}
+
+	/* Complete table */
+	if (repl) {
+		const BYTE *ptr = ip;
+		const BYTE *end;
+		end = ip + repl - (MINMATCH-1);
+		/* Pre-Load */
+		while (ptr < end - delta) {
+			chaintable[(size_t)(ptr) & MAXD_MASK] = delta;
+			ptr++;
+		}
+		do {
+			chaintable[(size_t)(ptr) & MAXD_MASK] = delta;
+			/* Head of chain */
+			hashtable[HASH_VALUE(ptr)] = (ptr) - base;
+			ptr++;
+		} while (ptr < end);
+		hc4->nexttoupdate = end;
+	}
+
+	return (int)ml;
+}
+
+static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4,
+	const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest,
+	const u8 **matchpos, const u8 **startpos)
+{
+	u16 *const chaintable = hc4->chaintable;
+	HTYPE *const hashtable = hc4->hashtable;
+#if LZ4_ARCH64
+	const BYTE * const base = hc4->base;
+#else
+	const int base = 0;
+#endif
+	const u8 *ref;
+	int nbattempts = MAX_NB_ATTEMPTS;
+	int delta = (int)(ip - startlimit);
+
+	/* First Match */
+	lz4hc_insert(hc4, ip);
+	ref = hashtable[HASH_VALUE(ip)] + base;
+
+	while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base)
+		&& (nbattempts)) {
+		nbattempts--;
+		if (*(startlimit + longest) == *(ref - delta + longest)) {
+			if (A32(ref) == A32(ip)) {
+				const u8 *reft = ref + MINMATCH;
+				const u8 *ipt = ip + MINMATCH;
+				const u8 *startt = ip;
+
+				while (ipt < matchlimit-(STEPSIZE - 1)) {
+					#if LZ4_ARCH64
+					u64 diff = A64(reft) ^ A64(ipt);
+					#else
+					u32 diff = A32(reft) ^ A32(ipt);
+					#endif
+
+					if (!diff) {
+						ipt += STEPSIZE;
+						reft += STEPSIZE;
+						continue;
+					}
+					ipt += LZ4_NBCOMMONBYTES(diff);
+					goto _endcount;
+				}
+				#if LZ4_ARCH64
+				if ((ipt < (matchlimit - 3))
+					&& (A32(reft) == A32(ipt))) {
+					ipt += 4;
+					reft += 4;
+				}
+				ipt += 2;
+				#endif
+				if ((ipt < (matchlimit - 1))
+					&& (A16(reft) == A16(ipt))) {
+					reft += 2;
+				}
+				if ((ipt < matchlimit) && (*reft == *ipt))
+					ipt++;
+_endcount:
+				reft = ref;
+
+				while ((startt > startlimit)
+					&& (reft > hc4->base)
+					&& (startt[-1] == reft[-1])) {
+					startt--;
+					reft--;
+				}
+
+				if ((ipt - startt) > longest) {
+					longest = (int)(ipt - startt);
+					*matchpos = reft;
+					*startpos = startt;
+				}
+			}
+		}
+		ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+	}
+	return longest;
+}
+
+static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor,
+		int ml, const u8 *ref)
+{
+	int length, len;
+	u8 *token;
+
+	/* Encode Literal length */
+	length = (int)(*ip - *anchor);
+	token = (*op)++;
+	if (length >= (int)RUN_MASK) {
+		*token = (RUN_MASK << ML_BITS);
+		len = length - RUN_MASK;
+		for (; len > 254 ; len -= 255)
+			*(*op)++ = 255;
+		*(*op)++ = (u8)len;
+	} else
+		*token = (length << ML_BITS);
+
+	/* Copy Literals */
+	LZ4_BLINDCOPY(*anchor, *op, length);
+
+	/* Encode Offset */
+	LZ4_WRITE_LITTLEENDIAN_16(*op, (u16)(*ip - ref));
+
+	/* Encode MatchLength */
+	len = (int)(ml - MINMATCH);
+	if (len >= (int)ML_MASK) {
+		*token += ML_MASK;
+		len -= ML_MASK;
+		for (; len > 509 ; len -= 510) {
+			*(*op)++ = 255;
+			*(*op)++ = 255;
+		}
+		if (len > 254) {
+			len -= 255;
+			*(*op)++ = 255;
+		}
+		*(*op)++ = (u8)len;
+	} else
+		*token += len;
+
+	/* Prepare next loop */
+	*ip += ml;
+	*anchor = *ip;
+
+	return 0;
+}
+
+static int lz4_compresshcctx(struct lz4hc_data *ctx,
+		const char *source,
+		char *dest,
+		int isize)
+{
+	const u8 *ip = (const u8 *)source;
+	const u8 *anchor = ip;
+	const u8 *const iend = ip + isize;
+	const u8 *const mflimit = iend - MFLIMIT;
+	const u8 *const matchlimit = (iend - LASTLITERALS);
+
+	u8 *op = (u8 *)dest;
+
+	int ml, ml2, ml3, ml0;
+	const u8 *ref = NULL;
+	const u8 *start2 = NULL;
+	const u8 *ref2 = NULL;
+	const u8 *start3 = NULL;
+	const u8 *ref3 = NULL;
+	const u8 *start0;
+	const u8 *ref0;
+	int lastrun;
+
+	ip++;
+
+	/* Main Loop */
+	while (ip < mflimit) {
+		ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref));
+		if (!ml) {
+			ip++;
+			continue;
+		}
+
+		/* saved, in case we would skip too much */
+		start0 = ip;
+		ref0 = ref;
+		ml0 = ml;
+_search2:
+		if (ip+ml < mflimit)
+			ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2,
+				ip + 1, matchlimit, ml, &ref2, &start2);
+		else
+			ml2 = ml;
+		/* No better match */
+		if (ml2 == ml) {
+			lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+			continue;
+		}
+
+		if (start0 < ip) {
+			/* empirical */
+			if (start2 < ip + ml0) {
+				ip = start0;
+				ref = ref0;
+				ml = ml0;
+			}
+		}
+		/*
+		 * Here, start0==ip
+		 * First Match too small : removed
+		 */
+		if ((start2 - ip) < 3) {
+			ml = ml2;
+			ip = start2;
+			ref = ref2;
+			goto _search2;
+		}
+
+_search3:
+		/*
+		 * Currently we have :
+		 * ml2 > ml1, and
+		 * ip1+3 <= ip2 (usually < ip1+ml1)
+		 */
+		if ((start2 - ip) < OPTIMAL_ML) {
+			int correction;
+			int new_ml = ml;
+			if (new_ml > OPTIMAL_ML)
+				new_ml = OPTIMAL_ML;
+			if (ip + new_ml > start2 + ml2 - MINMATCH)
+				new_ml = (int)(start2 - ip) + ml2 - MINMATCH;
+			correction = new_ml - (int)(start2 - ip);
+			if (correction > 0) {
+				start2 += correction;
+				ref2 += correction;
+				ml2 -= correction;
+			}
+		}
+		/*
+		 * Now, we have start2 = ip+new_ml,
+		 * with new_ml=min(ml, OPTIMAL_ML=18)
+		 */
+		if (start2 + ml2 < mflimit)
+			ml3 = lz4hc_insertandgetwidermatch(ctx,
+				start2 + ml2 - 3, start2, matchlimit,
+				ml2, &ref3, &start3);
+		else
+			ml3 = ml2;
+
+		/* No better match : 2 sequences to encode */
+		if (ml3 == ml2) {
+			/* ip & ref are known; Now for ml */
+			if (start2 < ip+ml)
+				ml = (int)(start2 - ip);
+
+			/* Now, encode 2 sequences */
+			lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+			ip = start2;
+			lz4_encodesequence(&ip, &op, &anchor, ml2, ref2);
+			continue;
+		}
+
+		/* Not enough space for match 2 : remove it */
+		if (start3 < ip + ml + 3) {
+			/*
+			 * can write Seq1 immediately ==> Seq2 is removed,
+			 * so Seq3 becomes Seq1
+			 */
+			if (start3 >= (ip + ml)) {
+				if (start2 < ip + ml) {
+					int correction =
+						(int)(ip + ml - start2);
+					start2 += correction;
+					ref2 += correction;
+					ml2 -= correction;
+					if (ml2 < MINMATCH) {
+						start2 = start3;
+						ref2 = ref3;
+						ml2 = ml3;
+					}
+				}
+
+				lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+				ip  = start3;
+				ref = ref3;
+				ml  = ml3;
+
+				start0 = start2;
+				ref0 = ref2;
+				ml0 = ml2;
+				goto _search2;
+			}
+
+			start2 = start3;
+			ref2 = ref3;
+			ml2 = ml3;
+			goto _search3;
+		}
+
+		/*
+		 * OK, now we have 3 ascending matches; let's write at least
+		 * the first one ip & ref are known; Now for ml
+		 */
+		if (start2 < ip + ml) {
+			if ((start2 - ip) < (int)ML_MASK) {
+				int correction;
+				if (ml > OPTIMAL_ML)
+					ml = OPTIMAL_ML;
+				if (ip + ml > start2 + ml2 - MINMATCH)
+					ml = (int)(start2 - ip) + ml2
+						- MINMATCH;
+				correction = ml - (int)(start2 - ip);
+				if (correction > 0) {
+					start2 += correction;
+					ref2 += correction;
+					ml2 -= correction;
+				}
+			} else
+				ml = (int)(start2 - ip);
+		}
+		lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+
+		ip = start2;
+		ref = ref2;
+		ml = ml2;
+
+		start2 = start3;
+		ref2 = ref3;
+		ml2 = ml3;
+
+		goto _search3;
+	}
+
+	/* Encode Last Literals */
+	lastrun = (int)(iend - anchor);
+	if (lastrun >= (int)RUN_MASK) {
+		*op++ = (RUN_MASK << ML_BITS);
+		lastrun -= RUN_MASK;
+		for (; lastrun > 254 ; lastrun -= 255)
+			*op++ = 255;
+		*op++ = (u8) lastrun;
+	} else
+		*op++ = (lastrun << ML_BITS);
+	memcpy(op, anchor, iend - anchor);
+	op += iend - anchor;
+	/* End */
+	return (int) (((char *)op) - dest);
+}
+
+int lz4hc_compress(const unsigned char *src, size_t src_len,
+			unsigned char *dst, size_t *dst_len, void *wrkmem)
+{
+	int ret = -1;
+	int out_len = 0;
+
+	struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem;
+	lz4hc_init(hc4, (const u8 *)src);
+	out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src,
+		(char *)dst, (int)src_len);
+
+	if (out_len < 0)
+		goto exit;
+
+	*dst_len = out_len;
+	return 0;
+
+exit:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lz4hc_compress);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZ4HC compressor");
-- 
cgit v0.10.2


From 0ea8530dcf762526459b29ac713a623b51fd691f Mon Sep 17 00:00:00 2001
From: Chanho Min <chanho.min@lge.com>
Date: Mon, 8 Jul 2013 16:01:51 -0700
Subject: crypto: add lz4 Cryptographic API

Add support for lz4 and lz4hc compression algorithm using the lib/lz4/*
codebase.

[akpm@linux-foundation.org: fix warnings]
Signed-off-by: Chanho Min <chanho.min@lge.com>
Cc: "Darrick J. Wong" <djwong@us.ibm.com>
Cc: Bob Pearson <rpearson@systemfabricworks.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Herbert Xu <herbert@gondor.hengli.com.au>
Cc: Yann Collet <yann.collet.73@gmail.com>
Cc: Kyungsik Lee <kyungsik.lee@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 904ffe8..69ce573 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1336,6 +1336,22 @@ config CRYPTO_842
 	help
 	  This is the 842 algorithm.
 
+config CRYPTO_LZ4
+	tristate "LZ4 compression algorithm"
+	select CRYPTO_ALGAPI
+	select LZ4_COMPRESS
+	select LZ4_DECOMPRESS
+	help
+	  This is the LZ4 algorithm.
+
+config CRYPTO_LZ4HC
+	tristate "LZ4HC compression algorithm"
+	select CRYPTO_ALGAPI
+	select LZ4HC_COMPRESS
+	select LZ4_DECOMPRESS
+	help
+	  This is the LZ4 high compression mode algorithm.
+
 comment "Random Number Generation"
 
 config CRYPTO_ANSI_CPRNG
diff --git a/crypto/Makefile b/crypto/Makefile
index 62af87d..2d5ed08 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -86,6 +86,8 @@ obj-$(CONFIG_CRYPTO_CRC32) += crc32.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif.o
 obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o
 obj-$(CONFIG_CRYPTO_LZO) += lzo.o
+obj-$(CONFIG_CRYPTO_LZ4) += lz4.o
+obj-$(CONFIG_CRYPTO_LZ4HC) += lz4hc.o
 obj-$(CONFIG_CRYPTO_842) += 842.o
 obj-$(CONFIG_CRYPTO_RNG2) += rng.o
 obj-$(CONFIG_CRYPTO_RNG2) += krng.o
diff --git a/crypto/lz4.c b/crypto/lz4.c
new file mode 100644
index 0000000..4586dd1
--- /dev/null
+++ b/crypto/lz4.c
@@ -0,0 +1,106 @@
+/*
+ * Cryptographic API.
+ *
+ * Copyright (c) 2013 Chanho Min <chanho.min@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/vmalloc.h>
+#include <linux/lz4.h>
+
+struct lz4_ctx {
+	void *lz4_comp_mem;
+};
+
+static int lz4_init(struct crypto_tfm *tfm)
+{
+	struct lz4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->lz4_comp_mem = vmalloc(LZ4_MEM_COMPRESS);
+	if (!ctx->lz4_comp_mem)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void lz4_exit(struct crypto_tfm *tfm)
+{
+	struct lz4_ctx *ctx = crypto_tfm_ctx(tfm);
+	vfree(ctx->lz4_comp_mem);
+}
+
+static int lz4_compress_crypto(struct crypto_tfm *tfm, const u8 *src,
+			    unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+	struct lz4_ctx *ctx = crypto_tfm_ctx(tfm);
+	size_t tmp_len = *dlen;
+	int err;
+
+	err = lz4_compress(src, slen, dst, &tmp_len, ctx->lz4_comp_mem);
+
+	if (err < 0)
+		return -EINVAL;
+
+	*dlen = tmp_len;
+	return 0;
+}
+
+static int lz4_decompress_crypto(struct crypto_tfm *tfm, const u8 *src,
+			      unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+	int err;
+	size_t tmp_len = *dlen;
+	size_t __slen = slen;
+
+	err = lz4_decompress(src, &__slen, dst, tmp_len);
+	if (err < 0)
+		return -EINVAL;
+
+	*dlen = tmp_len;
+	return err;
+}
+
+static struct crypto_alg alg_lz4 = {
+	.cra_name		= "lz4",
+	.cra_flags		= CRYPTO_ALG_TYPE_COMPRESS,
+	.cra_ctxsize		= sizeof(struct lz4_ctx),
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(alg_lz4.cra_list),
+	.cra_init		= lz4_init,
+	.cra_exit		= lz4_exit,
+	.cra_u			= { .compress = {
+	.coa_compress		= lz4_compress_crypto,
+	.coa_decompress		= lz4_decompress_crypto } }
+};
+
+static int __init lz4_mod_init(void)
+{
+	return crypto_register_alg(&alg_lz4);
+}
+
+static void __exit lz4_mod_fini(void)
+{
+	crypto_unregister_alg(&alg_lz4);
+}
+
+module_init(lz4_mod_init);
+module_exit(lz4_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZ4 Compression Algorithm");
diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c
new file mode 100644
index 0000000..151ba31
--- /dev/null
+++ b/crypto/lz4hc.c
@@ -0,0 +1,106 @@
+/*
+ * Cryptographic API.
+ *
+ * Copyright (c) 2013 Chanho Min <chanho.min@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/vmalloc.h>
+#include <linux/lz4.h>
+
+struct lz4hc_ctx {
+	void *lz4hc_comp_mem;
+};
+
+static int lz4hc_init(struct crypto_tfm *tfm)
+{
+	struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->lz4hc_comp_mem = vmalloc(LZ4HC_MEM_COMPRESS);
+	if (!ctx->lz4hc_comp_mem)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void lz4hc_exit(struct crypto_tfm *tfm)
+{
+	struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	vfree(ctx->lz4hc_comp_mem);
+}
+
+static int lz4hc_compress_crypto(struct crypto_tfm *tfm, const u8 *src,
+			    unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+	struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm);
+	size_t tmp_len = *dlen;
+	int err;
+
+	err = lz4hc_compress(src, slen, dst, &tmp_len, ctx->lz4hc_comp_mem);
+
+	if (err < 0)
+		return -EINVAL;
+
+	*dlen = tmp_len;
+	return 0;
+}
+
+static int lz4hc_decompress_crypto(struct crypto_tfm *tfm, const u8 *src,
+			      unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+	int err;
+	size_t tmp_len = *dlen;
+	size_t __slen = slen;
+
+	err = lz4_decompress(src, &__slen, dst, tmp_len);
+	if (err < 0)
+		return -EINVAL;
+
+	*dlen = tmp_len;
+	return err;
+}
+
+static struct crypto_alg alg_lz4hc = {
+	.cra_name		= "lz4hc",
+	.cra_flags		= CRYPTO_ALG_TYPE_COMPRESS,
+	.cra_ctxsize		= sizeof(struct lz4hc_ctx),
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(alg_lz4hc.cra_list),
+	.cra_init		= lz4hc_init,
+	.cra_exit		= lz4hc_exit,
+	.cra_u			= { .compress = {
+	.coa_compress		= lz4hc_compress_crypto,
+	.coa_decompress		= lz4hc_decompress_crypto } }
+};
+
+static int __init lz4hc_mod_init(void)
+{
+	return crypto_register_alg(&alg_lz4hc);
+}
+
+static void __exit lz4hc_mod_fini(void)
+{
+	crypto_unregister_alg(&alg_lz4hc);
+}
+
+module_init(lz4hc_mod_init);
+module_exit(lz4hc_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZ4HC Compression Algorithm");
-- 
cgit v0.10.2


From 1105200480b4faeb673d1b23658650b003302c06 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Mon, 8 Jul 2013 16:01:52 -0700
Subject: lib/scatterlist: factor out sg_miter_get_next_page() from
 sg_miter_next()

This patchset introduces sg_pcopy_from_buffer() and sg_pcopy_to_buffer(),
which copy data between a linear buffer and an SG list.

The only difference between sg_pcopy_{from,to}_buffer() and
sg_copy_{from,to}_buffer() is an additional argument that specifies the
number of bytes to skip the SG list before copying.

The main reason for introducing these functions is to fix a problem in
scsi_debug module.  And there is a local function in crypto/talitos
module, which can be replaced by sg_pcopy_to_buffer().

This patch:

sg_miter_get_next_page() is used to proceed page iterator to the next page
if necessary, and will be used to implement the variants of
sg_copy_{from,to}_buffer() later.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Imre Deak <imre.deak@intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: Horia Geanta <horia.geanta@freescale.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index a1cf8ca..c35b929 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -453,6 +453,28 @@ void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl,
 }
 EXPORT_SYMBOL(sg_miter_start);
 
+static bool sg_miter_get_next_page(struct sg_mapping_iter *miter)
+{
+	if (!miter->__remaining) {
+		struct scatterlist *sg;
+		unsigned long pgoffset;
+
+		if (!__sg_page_iter_next(&miter->piter))
+			return false;
+
+		sg = miter->piter.sg;
+		pgoffset = miter->piter.sg_pgoffset;
+
+		miter->__offset = pgoffset ? 0 : sg->offset;
+		miter->__remaining = sg->offset + sg->length -
+				(pgoffset << PAGE_SHIFT) - miter->__offset;
+		miter->__remaining = min_t(unsigned long, miter->__remaining,
+					   PAGE_SIZE - miter->__offset);
+	}
+
+	return true;
+}
+
 /**
  * sg_miter_next - proceed mapping iterator to the next mapping
  * @miter: sg mapping iter to proceed
@@ -478,22 +500,9 @@ bool sg_miter_next(struct sg_mapping_iter *miter)
 	 * Get to the next page if necessary.
 	 * __remaining, __offset is adjusted by sg_miter_stop
 	 */
-	if (!miter->__remaining) {
-		struct scatterlist *sg;
-		unsigned long pgoffset;
-
-		if (!__sg_page_iter_next(&miter->piter))
-			return false;
-
-		sg = miter->piter.sg;
-		pgoffset = miter->piter.sg_pgoffset;
+	if (!sg_miter_get_next_page(miter))
+		return false;
 
-		miter->__offset = pgoffset ? 0 : sg->offset;
-		miter->__remaining = sg->offset + sg->length -
-				(pgoffset << PAGE_SHIFT) - miter->__offset;
-		miter->__remaining = min_t(unsigned long, miter->__remaining,
-					   PAGE_SIZE - miter->__offset);
-	}
 	miter->page = sg_page_iter_page(&miter->piter);
 	miter->consumed = miter->length = miter->__remaining;
 
-- 
cgit v0.10.2


From df642cea25c90dc7d5dcd9d3b480b6b59de7d787 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Mon, 8 Jul 2013 16:01:54 -0700
Subject: lib/scatterlist: introduce sg_pcopy_from_buffer() and
 sg_pcopy_to_buffer()

The only difference between sg_pcopy_{from,to}_buffer() and
sg_copy_{from,to}_buffer() is an additional argument that specifies the
number of bytes to skip the SG list before copying.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Horia Geanta <horia.geanta@freescale.com>
Cc: Imre Deak <imre.deak@intel.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 2680677..adae88f 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -244,6 +244,11 @@ size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
 size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents,
 			 void *buf, size_t buflen);
 
+size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents,
+			    void *buf, size_t buflen, off_t skip);
+size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents,
+			  void *buf, size_t buflen, off_t skip);
+
 /*
  * Maximum number of entries that will be allocated in one piece, if
  * a list larger than this is required then chaining will be utilized.
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index c35b929..129a82f 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -476,6 +476,43 @@ static bool sg_miter_get_next_page(struct sg_mapping_iter *miter)
 }
 
 /**
+ * sg_miter_skip - reposition mapping iterator
+ * @miter: sg mapping iter to be skipped
+ * @offset: number of bytes to plus the current location
+ *
+ * Description:
+ *   Sets the offset of @miter to its current location plus @offset bytes.
+ *   If mapping iterator @miter has been proceeded by sg_miter_next(), this
+ *   stops @miter.
+ *
+ * Context:
+ *   Don't care if @miter is stopped, or not proceeded yet.
+ *   Otherwise, preemption disabled if the SG_MITER_ATOMIC is set.
+ *
+ * Returns:
+ *   true if @miter contains the valid mapping.  false if end of sg
+ *   list is reached.
+ */
+static bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset)
+{
+	sg_miter_stop(miter);
+
+	while (offset) {
+		off_t consumed;
+
+		if (!sg_miter_get_next_page(miter))
+			return false;
+
+		consumed = min_t(off_t, offset, miter->__remaining);
+		miter->__offset += consumed;
+		miter->__remaining -= consumed;
+		offset -= consumed;
+	}
+
+	return true;
+}
+
+/**
  * sg_miter_next - proceed mapping iterator to the next mapping
  * @miter: sg mapping iter to proceed
  *
@@ -561,14 +598,16 @@ EXPORT_SYMBOL(sg_miter_stop);
  * @nents:		 Number of SG entries
  * @buf:		 Where to copy from
  * @buflen:		 The number of bytes to copy
- * @to_buffer: 		 transfer direction (non zero == from an sg list to a
- * 			 buffer, 0 == from a buffer to an sg list
+ * @skip:		 Number of bytes to skip before copying
+ * @to_buffer:		 transfer direction (true == from an sg list to a
+ *			 buffer, false == from a buffer to an sg list
  *
  * Returns the number of copied bytes.
  *
  **/
 static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents,
-			     void *buf, size_t buflen, int to_buffer)
+			     void *buf, size_t buflen, off_t skip,
+			     bool to_buffer)
 {
 	unsigned int offset = 0;
 	struct sg_mapping_iter miter;
@@ -582,6 +621,9 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents,
 
 	sg_miter_start(&miter, sgl, nents, sg_flags);
 
+	if (!sg_miter_skip(&miter, skip))
+		return false;
+
 	local_irq_save(flags);
 
 	while (sg_miter_next(&miter) && offset < buflen) {
@@ -616,7 +658,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents,
 size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
 			   void *buf, size_t buflen)
 {
-	return sg_copy_buffer(sgl, nents, buf, buflen, 0);
+	return sg_copy_buffer(sgl, nents, buf, buflen, 0, false);
 }
 EXPORT_SYMBOL(sg_copy_from_buffer);
 
@@ -633,6 +675,42 @@ EXPORT_SYMBOL(sg_copy_from_buffer);
 size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents,
 			 void *buf, size_t buflen)
 {
-	return sg_copy_buffer(sgl, nents, buf, buflen, 1);
+	return sg_copy_buffer(sgl, nents, buf, buflen, 0, true);
 }
 EXPORT_SYMBOL(sg_copy_to_buffer);
+
+/**
+ * sg_pcopy_from_buffer - Copy from a linear buffer to an SG list
+ * @sgl:		 The SG list
+ * @nents:		 Number of SG entries
+ * @buf:		 Where to copy from
+ * @skip:		 Number of bytes to skip before copying
+ * @buflen:		 The number of bytes to copy
+ *
+ * Returns the number of copied bytes.
+ *
+ **/
+size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents,
+			    void *buf, size_t buflen, off_t skip)
+{
+	return sg_copy_buffer(sgl, nents, buf, buflen, skip, false);
+}
+EXPORT_SYMBOL(sg_pcopy_from_buffer);
+
+/**
+ * sg_pcopy_to_buffer - Copy from an SG list to a linear buffer
+ * @sgl:		 The SG list
+ * @nents:		 Number of SG entries
+ * @buf:		 Where to copy to
+ * @skip:		 Number of bytes to skip before copying
+ * @buflen:		 The number of bytes to copy
+ *
+ * Returns the number of copied bytes.
+ *
+ **/
+size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents,
+			  void *buf, size_t buflen, off_t skip)
+{
+	return sg_copy_buffer(sgl, nents, buf, buflen, skip, true);
+}
+EXPORT_SYMBOL(sg_pcopy_to_buffer);
-- 
cgit v0.10.2


From d05257238f9bbe96477162448c2dda08309af208 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Mon, 8 Jul 2013 16:01:55 -0700
Subject: crypto: talitos: use sg_pcopy_to_buffer()

Use sg_pcopy_to_buffer() which is better than the function previously used.
Because it doesn't do kmap/kunmap for skipped pages.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Horia Geanta <horia.geanta@freescale.com>
Cc: Imre Deak <imre.deak@intel.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 5b2b5e6..661dc3e 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -1112,64 +1112,6 @@ static int sg_count(struct scatterlist *sg_list, int nbytes, bool *chained)
 	return sg_nents;
 }
 
-/**
- * sg_copy_end_to_buffer - Copy end data from SG list to a linear buffer
- * @sgl:		 The SG list
- * @nents:		 Number of SG entries
- * @buf:		 Where to copy to
- * @buflen:		 The number of bytes to copy
- * @skip:		 The number of bytes to skip before copying.
- *                       Note: skip + buflen should equal SG total size.
- *
- * Returns the number of copied bytes.
- *
- **/
-static size_t sg_copy_end_to_buffer(struct scatterlist *sgl, unsigned int nents,
-				    void *buf, size_t buflen, unsigned int skip)
-{
-	unsigned int offset = 0;
-	unsigned int boffset = 0;
-	struct sg_mapping_iter miter;
-	unsigned long flags;
-	unsigned int sg_flags = SG_MITER_ATOMIC;
-	size_t total_buffer = buflen + skip;
-
-	sg_flags |= SG_MITER_FROM_SG;
-
-	sg_miter_start(&miter, sgl, nents, sg_flags);
-
-	local_irq_save(flags);
-
-	while (sg_miter_next(&miter) && offset < total_buffer) {
-		unsigned int len;
-		unsigned int ignore;
-
-		if ((offset + miter.length) > skip) {
-			if (offset < skip) {
-				/* Copy part of this segment */
-				ignore = skip - offset;
-				len = miter.length - ignore;
-				if (boffset + len > buflen)
-					len = buflen - boffset;
-				memcpy(buf + boffset, miter.addr + ignore, len);
-			} else {
-				/* Copy all of this segment (up to buflen) */
-				len = miter.length;
-				if (boffset + len > buflen)
-					len = buflen - boffset;
-				memcpy(buf + boffset, miter.addr, len);
-			}
-			boffset += len;
-		}
-		offset += miter.length;
-	}
-
-	sg_miter_stop(&miter);
-
-	local_irq_restore(flags);
-	return boffset;
-}
-
 /*
  * allocate and map the extended descriptor
  */
@@ -1800,7 +1742,7 @@ static int ahash_process_req(struct ahash_request *areq, unsigned int nbytes)
 
 	if (to_hash_later) {
 		int nents = sg_count(areq->src, nbytes, &chained);
-		sg_copy_end_to_buffer(areq->src, nents,
+		sg_pcopy_to_buffer(areq->src, nents,
 				      req_ctx->bufnext,
 				      to_hash_later,
 				      nbytes - to_hash_later);
-- 
cgit v0.10.2


From a451751172b39702e94c683882ab01d816b673c7 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Mon, 8 Jul 2013 16:01:57 -0700
Subject: scsi_debug: fix do_device_access() with wrap around range

do_device_access() is a function that abstracts copying SG list from/to
ramdisk storage (fake_storep).

It must deal with the ranges exceeding actual fake_storep size, because
such ranges are valid if virtual_gb is set greater than zero, and they
should be treated as fake_storep is repeatedly mirrored up to virtual
size.

Unfortunately, it can't deal with the range which wraps around the end of
fake_storep.  A wrap around range is copied by two
sg_copy_{from,to}_buffer() calls, but sg_copy_{from,to}_buffer() can't
copy from/to in the middle of SG list, therefore the second call can't
copy correctly.

This fixes it by using sg_pcopy_{from,to}_buffer() that can copy from/to
the middle of SG list.

This also simplifies the assignment of sdb->resid in
fill_from_dev_buffer().  Because fill_from_dev_buffer() is now only called
once per command execution cycle.  So it is not necessary to take care to
decrease sdb->resid if fill_from_dev_buffer() is called more than once.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Horia Geanta <horia.geanta@freescale.com>
Cc: Imre Deak <imre.deak@intel.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 0a537a0..d055450 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -439,10 +439,7 @@ static int fill_from_dev_buffer(struct scsi_cmnd *scp, unsigned char *arr,
 
 	act_len = sg_copy_from_buffer(sdb->table.sgl, sdb->table.nents,
 				      arr, arr_len);
-	if (sdb->resid)
-		sdb->resid -= act_len;
-	else
-		sdb->resid = scsi_bufflen(scp) - act_len;
+	sdb->resid = scsi_bufflen(scp) - act_len;
 
 	return 0;
 }
@@ -1693,24 +1690,48 @@ static int check_device_access_params(struct sdebug_dev_info *devi,
 	return 0;
 }
 
+/* Returns number of bytes copied or -1 if error. */
 static int do_device_access(struct scsi_cmnd *scmd,
 			    struct sdebug_dev_info *devi,
 			    unsigned long long lba, unsigned int num, int write)
 {
 	int ret;
 	unsigned long long block, rest = 0;
-	int (*func)(struct scsi_cmnd *, unsigned char *, int);
+	struct scsi_data_buffer *sdb;
+	enum dma_data_direction dir;
+	size_t (*func)(struct scatterlist *, unsigned int, void *, size_t,
+		       off_t);
+
+	if (write) {
+		sdb = scsi_out(scmd);
+		dir = DMA_TO_DEVICE;
+		func = sg_pcopy_to_buffer;
+	} else {
+		sdb = scsi_in(scmd);
+		dir = DMA_FROM_DEVICE;
+		func = sg_pcopy_from_buffer;
+	}
 
-	func = write ? fetch_to_dev_buffer : fill_from_dev_buffer;
+	if (!sdb->length)
+		return 0;
+	if (!(scsi_bidi_cmnd(scmd) || scmd->sc_data_direction == dir))
+		return -1;
 
 	block = do_div(lba, sdebug_store_sectors);
 	if (block + num > sdebug_store_sectors)
 		rest = block + num - sdebug_store_sectors;
 
-	ret = func(scmd, fake_storep + (block * scsi_debug_sector_size),
-		   (num - rest) * scsi_debug_sector_size);
-	if (!ret && rest)
-		ret = func(scmd, fake_storep, rest * scsi_debug_sector_size);
+	ret = func(sdb->table.sgl, sdb->table.nents,
+		   fake_storep + (block * scsi_debug_sector_size),
+		   (num - rest) * scsi_debug_sector_size, 0);
+	if (ret != (num - rest) * scsi_debug_sector_size)
+		return ret;
+
+	if (rest) {
+		ret += func(sdb->table.sgl, sdb->table.nents,
+			    fake_storep, rest * scsi_debug_sector_size,
+			    (num - rest) * scsi_debug_sector_size);
+	}
 
 	return ret;
 }
@@ -1849,7 +1870,12 @@ static int resp_read(struct scsi_cmnd *SCpnt, unsigned long long lba,
 	read_lock_irqsave(&atomic_rw, iflags);
 	ret = do_device_access(SCpnt, devip, lba, num, 0);
 	read_unlock_irqrestore(&atomic_rw, iflags);
-	return ret;
+	if (ret == -1)
+		return DID_ERROR << 16;
+
+	scsi_in(SCpnt)->resid = scsi_bufflen(SCpnt) - ret;
+
+	return 0;
 }
 
 void dump_sector(unsigned char *buf, int len)
-- 
cgit v0.10.2


From 27daabd9b6a157c34a6e7a7f509fa26866e6420f Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 8 Jul 2013 16:01:58 -0700
Subject: lib/scatterlist: error handling in __sg_alloc_table()

I was reviewing code which I suspected might allocate a zero size SG
table.  That will cause memory corruption.  Also we can't return before
doing the memset or we could end up using uninitialized memory in the
cleanup path.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Imre Deak <imre.deak@intel.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Maxim Levitsky <maximlevitsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 129a82f..a685c8a 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -247,13 +247,15 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents,
 	struct scatterlist *sg, *prv;
 	unsigned int left;
 
+	memset(table, 0, sizeof(*table));
+
+	if (nents == 0)
+		return -EINVAL;
 #ifndef ARCH_HAS_SG_CHAIN
 	if (WARN_ON_ONCE(nents > max_ents))
 		return -EINVAL;
 #endif
 
-	memset(table, 0, sizeof(*table));
-
 	left = nents;
 	prv = NULL;
 	do {
-- 
cgit v0.10.2