From a20c6bec0b8ae775e2e8f350819cef98eea9a832 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:36 -0400
Subject: NFS: grab open context in direct read

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 481be7f..8a89423 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -260,7 +260,7 @@ static void nfs_direct_read_release(void *calldata)
 
 	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
-	nfs_readdata_free(data);
+	nfs_readdata_release(data);
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -337,7 +337,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		data->inode = inode;
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
-		data->args.context = ctx;
+		data->args.context = get_nfs_open_context(ctx);
 		data->args.lock_context = dreq->l_ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-- 
cgit v0.10.2


From 1acbbb4e16209e85c35ff6cacad61d802c07289b Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:37 -0400
Subject: NFS4.1: make pnfs_ld_[read|write]_done consistent

The two functions had diverged quite a bit, with the write function
being a bit more robust than the read.

However, these still break badly in the desc->pg_bsize < PAGE_CACHE_SIZE case,
as then there is nothing hanging on the data->pages list, and the resend
ends up doing nothing.  This will be fixed in a patch later in the series.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 38512bc..9c4d14a 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1189,6 +1189,17 @@ static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *
 	return 0;
 }
 
+static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
+{
+	dprintk("pnfs write error = %d\n", data->pnfs_error);
+	if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
+	    PNFS_LAYOUTRET_ON_ERROR) {
+		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(data->inode)->flags);
+		pnfs_return_layout(data->inode);
+	}
+	data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);
+}
+
 /*
  * Called by non rpc-based layout drivers
  */
@@ -1197,19 +1208,8 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
 	if (likely(!data->pnfs_error)) {
 		pnfs_set_layoutcommit(data);
 		data->mds_ops->rpc_call_done(&data->task, data);
-	} else {
-		dprintk("pnfs write error = %d\n", data->pnfs_error);
-		if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
-						PNFS_LAYOUTRET_ON_ERROR) {
-			/* Don't lo_commit on error, Server will needs to
-			 * preform a file recovery.
-			 */
-			clear_bit(NFS_INO_LAYOUTCOMMIT,
-				  &NFS_I(data->inode)->flags);
-			pnfs_return_layout(data->inode);
-		}
-		data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);
-	}
+	} else
+		pnfs_ld_handle_write_error(data);
 	put_lseg(data->lseg);
 	data->mds_ops->rpc_release(data);
 }
@@ -1293,26 +1293,38 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
 
-static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
+static int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head)
 {
 	struct nfs_pageio_descriptor pgio;
+	LIST_HEAD(failed);
 
-	put_lseg(data->lseg);
-	data->lseg = NULL;
-	dprintk("pnfs write error = %d\n", data->pnfs_error);
-	if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
-						PNFS_LAYOUTRET_ON_ERROR)
-		pnfs_return_layout(data->inode);
-
-	nfs_pageio_init_read_mds(&pgio, data->inode);
-
-	while (!list_empty(&data->pages)) {
-		struct nfs_page *req = nfs_list_entry(data->pages.next);
+	/* Resend all requests through the MDS */
+	nfs_pageio_init_read_mds(&pgio, inode);
+	while (!list_empty(head)) {
+		struct nfs_page *req = nfs_list_entry(head->next);
 
 		nfs_list_remove_request(req);
-		nfs_pageio_add_request(&pgio, req);
+		if (!nfs_pageio_add_request(&pgio, req))
+			nfs_list_add_request(req, &failed);
 	}
 	nfs_pageio_complete(&pgio);
+
+	if (!list_empty(&failed)) {
+		list_move(&failed, head);
+		return -EIO;
+	}
+	return 0;
+}
+
+static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
+{
+	dprintk("pnfs read error = %d\n", data->pnfs_error);
+	if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
+	    PNFS_LAYOUTRET_ON_ERROR) {
+		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(data->inode)->flags);
+		pnfs_return_layout(data->inode);
+	}
+	data->task.tk_status = pnfs_read_done_resend_to_mds(data->inode, &data->pages);
 }
 
 /*
-- 
cgit v0.10.2


From 799ba8d53d32c84bd2a867ca2689538a48176140 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:38 -0400
Subject: NFS4.1: Add lseg to struct nfs4_fl_commit_bucket

Also create a commit_info structure to hold the bucket array and push
it up from the lseg to the layout where it really belongs.

While we are at it, fix a refcounting bug due to an (incorrect)
implicit assumption that filelayout_scan_ds_commit_list always
completely emptied the src list.

This clarifies refcounting, removes the ugly find_only_write_lseg
functions, and pushes the file layout commit code along on the path to
supporting multiple lsegs.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 5acfd9e..15aeba2 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -650,10 +650,66 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
 
 	dprintk("--> %s\n", __func__);
 	nfs4_fl_put_deviceid(fl->dsaddr);
-	kfree(fl->commit_buckets);
+	/* This assumes a single RW lseg */
+	if (lseg->pls_range.iomode == IOMODE_RW) {
+		struct nfs4_filelayout *flo;
+
+		flo = FILELAYOUT_FROM_HDR(lseg->pls_layout);
+		flo->commit_info.nbuckets = 0;
+		kfree(flo->commit_info.buckets);
+		flo->commit_info.buckets = NULL;
+	}
 	_filelayout_free_lseg(fl);
 }
 
+static int
+filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
+			     gfp_t gfp_flags)
+{
+	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+	struct nfs4_filelayout *flo = FILELAYOUT_FROM_HDR(lseg->pls_layout);
+
+	struct nfs4_fl_commit_bucket *buckets;
+	int size;
+
+	if (fl->commit_through_mds)
+		return 0;
+	if (flo->commit_info.nbuckets != 0) {
+		/* This assumes there is only one IOMODE_RW lseg.  What
+		 * we really want to do is have a layout_hdr level
+		 * dictionary of <multipath_list4, fh> keys, each
+		 * associated with a struct list_head, populated by calls
+		 * to filelayout_write_pagelist().
+		 * */
+		return 0;
+	}
+
+	size = (fl->stripe_type == STRIPE_SPARSE) ?
+		fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
+
+	buckets = kcalloc(size, sizeof(struct nfs4_fl_commit_bucket),
+			  gfp_flags);
+	if (!buckets)
+		return -ENOMEM;
+	else {
+		int i;
+
+		spin_lock(&lseg->pls_layout->plh_inode->i_lock);
+		if (flo->commit_info.nbuckets != 0)
+			kfree(buckets);
+		else {
+			flo->commit_info.buckets = buckets;
+			flo->commit_info.nbuckets = size;
+			for (i = 0; i < size; i++) {
+				INIT_LIST_HEAD(&buckets[i].written);
+				INIT_LIST_HEAD(&buckets[i].committing);
+			}
+		}
+		spin_unlock(&lseg->pls_layout->plh_inode->i_lock);
+		return 0;
+	}
+}
+
 static struct pnfs_layout_segment *
 filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 		      struct nfs4_layoutget_res *lgr,
@@ -673,29 +729,6 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 		_filelayout_free_lseg(fl);
 		return NULL;
 	}
-
-	/* This assumes there is only one IOMODE_RW lseg.  What
-	 * we really want to do is have a layout_hdr level
-	 * dictionary of <multipath_list4, fh> keys, each
-	 * associated with a struct list_head, populated by calls
-	 * to filelayout_write_pagelist().
-	 * */
-	if ((!fl->commit_through_mds) && (lgr->range.iomode == IOMODE_RW)) {
-		int i;
-		int size = (fl->stripe_type == STRIPE_SPARSE) ?
-			fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
-
-		fl->commit_buckets = kcalloc(size, sizeof(struct nfs4_fl_commit_bucket), gfp_flags);
-		if (!fl->commit_buckets) {
-			filelayout_free_lseg(&fl->generic_hdr);
-			return NULL;
-		}
-		fl->number_of_buckets = size;
-		for (i = 0; i < size; i++) {
-			INIT_LIST_HEAD(&fl->commit_buckets[i].written);
-			INIT_LIST_HEAD(&fl->commit_buckets[i].committing);
-		}
-	}
 	return &fl->generic_hdr;
 }
 
@@ -747,6 +780,8 @@ static void
 filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 			 struct nfs_page *req)
 {
+	int status;
+
 	BUG_ON(pgio->pg_lseg != NULL);
 
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
@@ -757,7 +792,16 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 					   GFP_NOFS);
 	/* If no lseg, fall back to write through mds */
 	if (pgio->pg_lseg == NULL)
-		nfs_pageio_reset_write_mds(pgio);
+		goto out_mds;
+	status = filelayout_alloc_commit_info(pgio->pg_lseg, GFP_NOFS);
+	if (status < 0) {
+		put_lseg(pgio->pg_lseg);
+		pgio->pg_lseg = NULL;
+		goto out_mds;
+	}
+	return;
+out_mds:
+	nfs_pageio_reset_write_mds(pgio);
 }
 
 static const struct nfs_pageio_ops filelayout_pg_read_ops = {
@@ -793,17 +837,13 @@ filelayout_clear_request_commit(struct nfs_page *req)
 	if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
 		goto out;
 	if (list_is_singular(&req->wb_list)) {
-		struct pnfs_layout_segment *lseg;
+		struct nfs4_fl_commit_bucket *bucket;
 
-		/* From here we can find the bucket, but for the moment,
-		 * since there is only one relevant lseg...
-		 */
-		list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
-			if (lseg->pls_range.iomode == IOMODE_RW) {
-				freeme = lseg;
-				break;
-			}
-		}
+		bucket = list_first_entry(&req->wb_list,
+					  struct nfs4_fl_commit_bucket,
+					  written);
+		freeme = bucket->wlseg;
+		bucket->wlseg = NULL;
 	}
 out:
 	nfs_request_remove_commit_list(req);
@@ -818,6 +858,7 @@ filelayout_choose_commit_list(struct nfs_page *req,
 	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
 	u32 i, j;
 	struct list_head *list;
+	struct nfs4_fl_commit_bucket *buckets;
 
 	if (fl->commit_through_mds)
 		return &NFS_I(req->wb_context->dentry->d_inode)->commit_list;
@@ -831,15 +872,16 @@ filelayout_choose_commit_list(struct nfs_page *req,
 	j = nfs4_fl_calc_j_index(lseg,
 				 (loff_t)req->wb_index << PAGE_CACHE_SHIFT);
 	i = select_bucket_index(fl, j);
-	list = &fl->commit_buckets[i].written;
+	buckets = FILELAYOUT_FROM_HDR(lseg->pls_layout)->commit_info.buckets;
+	list = &buckets[i].written;
 	if (list_empty(list)) {
 		/* Non-empty buckets hold a reference on the lseg.  That ref
 		 * is normally transferred to the COMMIT call and released
 		 * there.  It could also be released if the last req is pulled
 		 * off due to a rewrite, in which case it will be done in
-		 * filelayout_remove_commit_req
+		 * filelayout_clear_request_commit
 		 */
-		get_lseg(lseg);
+		buckets[i].wlseg = get_lseg(lseg);
 	}
 	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 	return list;
@@ -908,32 +950,6 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
 				   &filelayout_commit_call_ops, how);
 }
 
-/*
- * This is only useful while we are using whole file layouts.
- */
-static struct pnfs_layout_segment *
-find_only_write_lseg_locked(struct inode *inode)
-{
-	struct pnfs_layout_segment *lseg;
-
-	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
-		if (lseg->pls_range.iomode == IOMODE_RW)
-			return lseg;
-	return NULL;
-}
-
-static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode)
-{
-	struct pnfs_layout_segment *rv;
-
-	spin_lock(&inode->i_lock);
-	rv = find_only_write_lseg_locked(inode);
-	if (rv)
-		get_lseg(rv);
-	spin_unlock(&inode->i_lock);
-	return rv;
-}
-
 static int
 filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max,
 		spinlock_t *lock)
@@ -955,6 +971,13 @@ filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max,
 		if (ret == max)
 			break;
 	}
+	if (ret) {
+		bucket->clseg = bucket->wlseg;
+		if (list_empty(src))
+			bucket->wlseg = NULL;
+		else
+			get_lseg(bucket->clseg);
+	}
 	return ret;
 }
 
@@ -964,18 +987,14 @@ filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max,
 static int filelayout_scan_commit_lists(struct inode *inode, int max,
 		spinlock_t *lock)
 {
-	struct pnfs_layout_segment *lseg;
-	struct nfs4_filelayout_segment *fl;
+	struct nfs4_fl_commit_info *fl_cinfo;
 	int i, rv = 0, cnt;
 
-	lseg = find_only_write_lseg_locked(inode);
-	if (!lseg)
-		goto out_done;
-	fl = FILELAYOUT_LSEG(lseg);
-	if (fl->commit_through_mds)
+	fl_cinfo = &FILELAYOUT_FROM_HDR(NFS_I(inode)->layout)->commit_info;
+	if (fl_cinfo->nbuckets == 0)
 		goto out_done;
-	for (i = 0; i < fl->number_of_buckets && max != 0; i++) {
-		cnt = filelayout_scan_ds_commit_list(&fl->commit_buckets[i],
+	for (i = 0; i < fl_cinfo->nbuckets && max != 0; i++) {
+		cnt = filelayout_scan_ds_commit_list(&fl_cinfo->buckets[i],
 				max, lock);
 		max -= cnt;
 		rv += cnt;
@@ -987,38 +1006,35 @@ out_done:
 static unsigned int
 alloc_ds_commits(struct inode *inode, struct list_head *list)
 {
-	struct pnfs_layout_segment *lseg;
-	struct nfs4_filelayout_segment *fl;
+	struct nfs4_fl_commit_info *fl_cinfo;
+	struct nfs4_fl_commit_bucket *bucket;
 	struct nfs_write_data *data;
 	int i, j;
 	unsigned int nreq = 0;
 
-	/* Won't need this when non-whole file layout segments are supported
-	 * instead we will use a pnfs_layout_hdr structure */
-	lseg = find_only_write_lseg(inode);
-	if (!lseg)
-		return 0;
-	fl = FILELAYOUT_LSEG(lseg);
-	for (i = 0; i < fl->number_of_buckets; i++) {
-		if (list_empty(&fl->commit_buckets[i].committing))
+	fl_cinfo = &FILELAYOUT_FROM_HDR(NFS_I(inode)->layout)->commit_info;
+	bucket = fl_cinfo->buckets;
+	for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
+		if (list_empty(&bucket->committing))
 			continue;
 		data = nfs_commitdata_alloc();
 		if (!data)
 			break;
 		data->ds_commit_index = i;
-		data->lseg = lseg;
+		data->lseg = bucket->clseg;
+		bucket->clseg = NULL;
 		list_add(&data->pages, list);
 		nreq++;
 	}
 
 	/* Clean up on error */
-	for (j = i; j < fl->number_of_buckets; j++) {
-		if (list_empty(&fl->commit_buckets[i].committing))
+	for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) {
+		if (list_empty(&bucket->committing))
 			continue;
-		nfs_retry_commit(&fl->commit_buckets[i].committing, lseg);
-		put_lseg(lseg);  /* associated with emptying bucket */
+		nfs_retry_commit(&bucket->committing, bucket->clseg);
+		put_lseg(bucket->clseg);
+		bucket->clseg = NULL;
 	}
-	put_lseg(lseg);
 	/* Caller will clean up entries put on list */
 	return nreq;
 }
@@ -1058,7 +1074,10 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 			nfs_initiate_commit(data, NFS_CLIENT(inode),
 					    data->mds_ops, how);
 		} else {
-			nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index].committing, data->lseg);
+			struct nfs4_fl_commit_info *fl_cinfo;
+
+			fl_cinfo = &FILELAYOUT_FROM_HDR(data->lseg->pls_layout)->commit_info;
+			nfs_init_commit(data, &fl_cinfo->buckets[data->ds_commit_index].committing, data->lseg);
 			filelayout_initiate_commit(data, how);
 		}
 	}
@@ -1072,10 +1091,27 @@ filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
 	nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
 }
 
+static struct pnfs_layout_hdr *
+filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+	struct nfs4_filelayout *flo;
+
+	flo = kzalloc(sizeof(*flo), gfp_flags);
+	return &flo->generic_hdr;
+}
+
+static void
+filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	kfree(FILELAYOUT_FROM_HDR(lo));
+}
+
 static struct pnfs_layoutdriver_type filelayout_type = {
 	.id			= LAYOUT_NFSV4_1_FILES,
 	.name			= "LAYOUT_NFSV4_1_FILES",
 	.owner			= THIS_MODULE,
+	.alloc_layout_hdr	= filelayout_alloc_layout_hdr,
+	.free_layout_hdr	= filelayout_free_layout_hdr,
 	.alloc_lseg		= filelayout_alloc_lseg,
 	.free_lseg		= filelayout_free_lseg,
 	.pg_read_ops		= &filelayout_pg_read_ops,
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 21190bb..333a3ac 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -77,6 +77,13 @@ struct nfs4_file_layout_dsaddr {
 struct nfs4_fl_commit_bucket {
 	struct list_head written;
 	struct list_head committing;
+	struct pnfs_layout_segment *wlseg;
+	struct pnfs_layout_segment *clseg;
+};
+
+struct nfs4_fl_commit_info {
+	int nbuckets;
+	struct nfs4_fl_commit_bucket *buckets;
 };
 
 struct nfs4_filelayout_segment {
@@ -89,10 +96,19 @@ struct nfs4_filelayout_segment {
 	struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
 	unsigned int num_fh;
 	struct nfs_fh **fh_array;
-	struct nfs4_fl_commit_bucket *commit_buckets; /* Sort commits to ds */
-	int number_of_buckets;
 };
 
+struct nfs4_filelayout {
+	struct pnfs_layout_hdr generic_hdr;
+	struct nfs4_fl_commit_info commit_info;
+};
+
+static inline struct nfs4_filelayout *
+FILELAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
+{
+	return container_of(lo, struct nfs4_filelayout, generic_hdr);
+}
+
 static inline struct nfs4_filelayout_segment *
 FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
 {
-- 
cgit v0.10.2


From 0b7c01533aa9f4a228d07d2768d084acb3a387bc Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:39 -0400
Subject: NFS: add a struct nfs_commit_data to replace nfs_write_data in
 commits

Commits don't need the vectors of pages, etc. that writes do. Split out
a separate structure for the commit operation.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 8a89423..5897dfe 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -82,7 +82,7 @@ struct nfs_direct_req {
 
 	/* commit state */
 	struct list_head	rewrite_list;	/* saved nfs_write_data structs */
-	struct nfs_write_data *	commit_data;	/* special write_data for commits */
+	struct nfs_commit_data *commit_data;	/* special write_data for commits */
 	int			flags;
 #define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
 #define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
@@ -524,7 +524,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 
 static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
 {
-	struct nfs_write_data *data = calldata;
+	struct nfs_commit_data *data = calldata;
 
 	/* Call the NFS version-specific code */
 	NFS_PROTO(data->inode)->commit_done(task, data);
@@ -532,8 +532,8 @@ static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
 
 static void nfs_direct_commit_release(void *calldata)
 {
-	struct nfs_write_data *data = calldata;
-	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+	struct nfs_commit_data *data = calldata;
+	struct nfs_direct_req *dreq = data->dreq;
 	int status = data->task.tk_status;
 
 	if (status < 0) {
@@ -551,14 +551,14 @@ static void nfs_direct_commit_release(void *calldata)
 }
 
 static const struct rpc_call_ops nfs_commit_direct_ops = {
-	.rpc_call_prepare = nfs_write_prepare,
+	.rpc_call_prepare = nfs_commit_prepare,
 	.rpc_call_done = nfs_direct_commit_result,
 	.rpc_release = nfs_direct_commit_release,
 };
 
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 {
-	struct nfs_write_data *data = dreq->commit_data;
+	struct nfs_commit_data *data = dreq->commit_data;
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
@@ -581,9 +581,6 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->args.fh = NFS_FH(data->inode);
 	data->args.offset = 0;
 	data->args.count = 0;
-	data->args.context = dreq->ctx;
-	data->args.lock_context = dreq->l_ctx;
-	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
 	nfs_fattr_init(&data->fattr);
@@ -625,7 +622,7 @@ static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
 {
 	dreq->commit_data = nfs_commitdata_alloc();
 	if (dreq->commit_data != NULL)
-		dreq->commit_data->req = (struct nfs_page *) dreq;
+		dreq->commit_data->dreq = dreq;
 }
 #else
 static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b777bda..29ab441 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -314,24 +314,25 @@ extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
 				  struct inode *inode, int ioflags);
 extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_writedata_release(struct nfs_write_data *wdata);
-extern void nfs_commit_free(struct nfs_write_data *p);
+extern void nfs_commit_free(struct nfs_commit_data *p);
 extern int nfs_initiate_write(struct nfs_write_data *data,
 			      struct rpc_clnt *clnt,
 			      const struct rpc_call_ops *call_ops,
 			      int how);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
-extern int nfs_initiate_commit(struct nfs_write_data *data,
-			       struct rpc_clnt *clnt,
+extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
+extern int nfs_initiate_commit(struct rpc_clnt *clnt,
+			       struct nfs_commit_data *data,
 			       const struct rpc_call_ops *call_ops,
 			       int how);
-extern void nfs_init_commit(struct nfs_write_data *data,
+extern void nfs_init_commit(struct nfs_commit_data *data,
 			    struct list_head *head,
 			    struct pnfs_layout_segment *lseg);
 void nfs_retry_commit(struct list_head *page_list,
 		      struct pnfs_layout_segment *lseg);
 void nfs_commit_clear_lock(struct nfs_inode *nfsi);
-void nfs_commitdata_release(void *data);
-void nfs_commit_release_pages(struct nfs_write_data *data);
+void nfs_commitdata_release(struct nfs_commit_data *data);
+void nfs_commit_release_pages(struct nfs_commit_data *data);
 void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head);
 void nfs_request_remove_commit_list(struct nfs_page *req);
 
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 5242eae..b1daca7 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -848,7 +848,12 @@ static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_
 	rpc_call_start(task);
 }
 
-static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
+static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	rpc_call_start(task);
+}
+
+static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
 {
 	if (nfs3_async_handle_jukebox(task, data->inode))
 		return -EAGAIN;
@@ -856,7 +861,7 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 	return 0;
 }
 
-static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
+static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)
 {
 	msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT];
 }
@@ -907,6 +912,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.write_rpc_prepare = nfs3_proc_write_rpc_prepare,
 	.write_done	= nfs3_write_done,
 	.commit_setup	= nfs3_proc_commit_setup,
+	.commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,
 	.commit_done	= nfs3_commit_done,
 	.lock		= nfs3_proc_lock,
 	.clear_acl_cache = nfs3_forget_cached_acls,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index a77cc9a..01e53e9 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1287,7 +1287,7 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
  *	};
  */
 static void encode_commit3args(struct xdr_stream *xdr,
-			       const struct nfs_writeargs *args)
+			       const struct nfs_commitargs *args)
 {
 	__be32 *p;
 
@@ -1300,7 +1300,7 @@ static void encode_commit3args(struct xdr_stream *xdr,
 
 static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
 				     struct xdr_stream *xdr,
-				     const struct nfs_writeargs *args)
+				     const struct nfs_commitargs *args)
 {
 	encode_commit3args(xdr, args);
 }
@@ -2319,7 +2319,7 @@ out_status:
  */
 static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
 				   struct xdr_stream *xdr,
-				   struct nfs_writeres *result)
+				   struct nfs_commitres *result)
 {
 	enum nfs_stat status;
 	int error;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 15aeba2..675ce3b 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -250,7 +250,7 @@ static int filelayout_write_done_cb(struct rpc_task *task,
 }
 
 /* Fake up some data that will cause nfs_commit_release to retry the writes. */
-static void prepare_to_resend_writes(struct nfs_write_data *data)
+static void prepare_to_resend_writes(struct nfs_commit_data *data)
 {
 	struct nfs_page *first = nfs_list_entry(data->pages.next);
 
@@ -261,11 +261,11 @@ static void prepare_to_resend_writes(struct nfs_write_data *data)
 }
 
 static int filelayout_commit_done_cb(struct rpc_task *task,
-				     struct nfs_write_data *data)
+				     struct nfs_commit_data *data)
 {
 	int reset = 0;
 
-	if (filelayout_async_handle_error(task, data->args.context->state,
+	if (filelayout_async_handle_error(task, data->context->state,
 					  data->ds_clp, &reset) == -EAGAIN) {
 		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
 			__func__, data->ds_clp, data->ds_clp->cl_session);
@@ -315,15 +315,42 @@ static void filelayout_write_release(void *data)
 	wdata->mds_ops->rpc_release(data);
 }
 
-static void filelayout_commit_release(void *data)
+static void filelayout_commit_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+	struct nfs_commit_data *wdata = data;
 
-	nfs_commit_release_pages(wdata);
-	if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding))
-		nfs_commit_clear_lock(NFS_I(wdata->inode));
-	put_lseg(wdata->lseg);
-	nfs_commitdata_release(wdata);
+	if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
+				&wdata->args.seq_args, &wdata->res.seq_res,
+				task))
+		return;
+
+	rpc_call_start(task);
+}
+
+static void filelayout_write_commit_done(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *wdata = data;
+
+	/* Note this may cause RPC to be resent */
+	wdata->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *cdata = data;
+
+	rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
+}
+
+static void filelayout_commit_release(void *calldata)
+{
+	struct nfs_commit_data *data = calldata;
+
+	nfs_commit_release_pages(data);
+	if (atomic_dec_and_test(&NFS_I(data->inode)->commits_outstanding))
+		nfs_commit_clear_lock(NFS_I(data->inode));
+	put_lseg(data->lseg);
+	nfs_commitdata_release(data);
 }
 
 static const struct rpc_call_ops filelayout_read_call_ops = {
@@ -341,9 +368,9 @@ static const struct rpc_call_ops filelayout_write_call_ops = {
 };
 
 static const struct rpc_call_ops filelayout_commit_call_ops = {
-	.rpc_call_prepare = filelayout_write_prepare,
-	.rpc_call_done = filelayout_write_call_done,
-	.rpc_count_stats = filelayout_write_count_stats,
+	.rpc_call_prepare = filelayout_commit_prepare,
+	.rpc_call_done = filelayout_write_commit_done,
+	.rpc_count_stats = filelayout_commit_count_stats,
 	.rpc_release = filelayout_commit_release,
 };
 
@@ -922,7 +949,7 @@ select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
 	return flseg->fh_array[i];
 }
 
-static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
+static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
 {
 	struct pnfs_layout_segment *lseg = data->lseg;
 	struct nfs4_pnfs_ds *ds;
@@ -941,12 +968,12 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
 		return -EAGAIN;
 	}
 	dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how);
-	data->write_done_cb = filelayout_commit_done_cb;
+	data->commit_done_cb = filelayout_commit_done_cb;
 	data->ds_clp = ds->ds_clp;
 	fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
 	if (fh)
 		data->args.fh = fh;
-	return nfs_initiate_commit(data, ds->ds_clp->cl_rpcclient,
+	return nfs_initiate_commit(ds->ds_clp->cl_rpcclient, data,
 				   &filelayout_commit_call_ops, how);
 }
 
@@ -1008,7 +1035,7 @@ alloc_ds_commits(struct inode *inode, struct list_head *list)
 {
 	struct nfs4_fl_commit_info *fl_cinfo;
 	struct nfs4_fl_commit_bucket *bucket;
-	struct nfs_write_data *data;
+	struct nfs_commit_data *data;
 	int i, j;
 	unsigned int nreq = 0;
 
@@ -1044,7 +1071,7 @@ static int
 filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 			   int how)
 {
-	struct nfs_write_data	*data, *tmp;
+	struct nfs_commit_data *data, *tmp;
 	LIST_HEAD(list);
 	unsigned int nreq = 0;
 
@@ -1071,7 +1098,7 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 		list_del_init(&data->pages);
 		if (!data->lseg) {
 			nfs_init_commit(data, mds_pages, NULL);
-			nfs_initiate_commit(data, NFS_CLIENT(inode),
+			nfs_initiate_commit(NFS_CLIENT(inode), data,
 					    data->mds_ops, how);
 		} else {
 			struct nfs4_fl_commit_info *fl_cinfo;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 75eb883..cc04b6e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3468,7 +3468,17 @@ static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_
 	rpc_call_start(task);
 }
 
-static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data)
+static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+				&data->args.seq_args,
+				&data->res.seq_res,
+				task))
+		return;
+	rpc_call_start(task);
+}
+
+static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data)
 {
 	struct inode *inode = data->inode;
 
@@ -3480,14 +3490,14 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat
 	return 0;
 }
 
-static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs4_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
 {
 	if (!nfs4_sequence_done(task, &data->res.seq_res))
 		return -EAGAIN;
-	return data->write_done_cb(task, data);
+	return data->commit_done_cb(task, data);
 }
 
-static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
+static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
@@ -3496,8 +3506,8 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
 		data->res.fattr = NULL;
 	} else
 		data->args.bitmask = server->cache_consistency_bitmask;
-	if (!data->write_done_cb)
-		data->write_done_cb = nfs4_commit_done_cb;
+	if (data->commit_done_cb == NULL)
+		data->commit_done_cb = nfs4_commit_done_cb;
 	data->res.server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
 	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
@@ -6591,6 +6601,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.write_rpc_prepare = nfs4_proc_write_rpc_prepare,
 	.write_done	= nfs4_write_done,
 	.commit_setup	= nfs4_proc_commit_setup,
+	.commit_rpc_prepare = nfs4_proc_commit_rpc_prepare,
 	.commit_done	= nfs4_commit_done,
 	.lock		= nfs4_proc_lock,
 	.clear_acl_cache = nfs4_zap_acl_attr,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c54aae3..4c3cc0e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1103,7 +1103,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 	encode_nfs4_stateid(xdr, arg->stateid);
 }
 
-static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
+static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
@@ -2448,7 +2448,7 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
  *  a COMMIT request
  */
 static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
-				struct nfs_writeargs *args)
+				struct nfs_commitargs *args)
 {
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
@@ -4102,7 +4102,7 @@ static int decode_verifier(struct xdr_stream *xdr, void *verifier)
 	return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE);
 }
 
-static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
+static int decode_commit(struct xdr_stream *xdr, struct nfs_commitres *res)
 {
 	int status;
 
@@ -6353,7 +6353,7 @@ out:
  * Decode COMMIT response
  */
 static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
-			       struct nfs_writeres *res)
+			       struct nfs_commitres *res)
 {
 	struct compound_hdr hdr;
 	int status;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index b63b6f4..bf80503 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -688,8 +688,13 @@ static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_d
 	rpc_call_start(task);
 }
 
+static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	BUG();
+}
+
 static void
-nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
+nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)
 {
 	BUG();
 }
@@ -764,6 +769,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.write_rpc_prepare = nfs_proc_write_rpc_prepare,
 	.write_done	= nfs_write_done,
 	.commit_setup	= nfs_proc_commit_setup,
+	.commit_rpc_prepare = nfs_proc_commit_rpc_prepare,
 	.lock		= nfs_proc_lock,
 	.lock_check_bounds = nfs_lock_check_bounds,
 	.close_context	= nfs_close_context,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c074623..54f7c0f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -48,11 +48,12 @@ static const struct rpc_call_ops nfs_commit_ops;
 
 static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
+static struct kmem_cache *nfs_cdata_cachep;
 static mempool_t *nfs_commit_mempool;
 
-struct nfs_write_data *nfs_commitdata_alloc(void)
+struct nfs_commit_data *nfs_commitdata_alloc(void)
 {
-	struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
+	struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
@@ -62,10 +63,8 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
 }
 EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
 
-void nfs_commit_free(struct nfs_write_data *p)
+void nfs_commit_free(struct nfs_commit_data *p)
 {
-	if (p && (p->pagevec != &p->page_array[0]))
-		kfree(p->pagevec);
 	mempool_free(p, nfs_commit_mempool);
 }
 EXPORT_SYMBOL_GPL(nfs_commit_free);
@@ -1179,6 +1178,13 @@ void nfs_write_prepare(struct rpc_task *task, void *calldata)
 	NFS_PROTO(data->inode)->write_rpc_prepare(task, data);
 }
 
+void nfs_commit_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_commit_data *data = calldata;
+
+	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
+}
+
 static const struct rpc_call_ops nfs_write_partial_ops = {
 	.rpc_call_prepare = nfs_write_prepare,
 	.rpc_call_done = nfs_writeback_done_partial,
@@ -1355,16 +1361,14 @@ void nfs_commit_clear_lock(struct nfs_inode *nfsi)
 }
 EXPORT_SYMBOL_GPL(nfs_commit_clear_lock);
 
-void nfs_commitdata_release(void *data)
+void nfs_commitdata_release(struct nfs_commit_data *data)
 {
-	struct nfs_write_data *wdata = data;
-
-	put_nfs_open_context(wdata->args.context);
-	nfs_commit_free(wdata);
+	put_nfs_open_context(data->context);
+	nfs_commit_free(data);
 }
 EXPORT_SYMBOL_GPL(nfs_commitdata_release);
 
-int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt,
+int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
 			const struct rpc_call_ops *call_ops,
 			int how)
 {
@@ -1403,7 +1407,7 @@ EXPORT_SYMBOL_GPL(nfs_initiate_commit);
 /*
  * Set up the argument/result storage required for the RPC call.
  */
-void nfs_init_commit(struct nfs_write_data *data,
+void nfs_init_commit(struct nfs_commit_data *data,
 			    struct list_head *head,
 			    struct pnfs_layout_segment *lseg)
 {
@@ -1424,8 +1428,7 @@ void nfs_init_commit(struct nfs_write_data *data,
 	/* Note: we always request a commit of the entire inode */
 	data->args.offset = 0;
 	data->args.count  = 0;
-	data->args.context = get_nfs_open_context(first->wb_context);
-	data->res.count   = 0;
+	data->context     = get_nfs_open_context(first->wb_context);
 	data->res.fattr   = &data->fattr;
 	data->res.verf    = &data->verf;
 	nfs_fattr_init(&data->fattr);
@@ -1455,7 +1458,7 @@ EXPORT_SYMBOL_GPL(nfs_retry_commit);
 static int
 nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 {
-	struct nfs_write_data	*data;
+	struct nfs_commit_data	*data;
 
 	data = nfs_commitdata_alloc();
 
@@ -1464,7 +1467,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 
 	/* Set up the argument struct */
 	nfs_init_commit(data, head, NULL);
-	return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how);
+	return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops, how);
  out_bad:
 	nfs_retry_commit(head, NULL);
 	nfs_commit_clear_lock(NFS_I(inode));
@@ -1476,7 +1479,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
  */
 static void nfs_commit_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_write_data	*data = calldata;
+	struct nfs_commit_data	*data = calldata;
 
         dprintk("NFS: %5u nfs_commit_done (status %d)\n",
                                 task->tk_pid, task->tk_status);
@@ -1485,7 +1488,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 	NFS_PROTO(data->inode)->commit_done(task, data);
 }
 
-void nfs_commit_release_pages(struct nfs_write_data *data)
+void nfs_commit_release_pages(struct nfs_commit_data *data)
 {
 	struct nfs_page	*req;
 	int status = data->task.tk_status;
@@ -1526,7 +1529,7 @@ EXPORT_SYMBOL_GPL(nfs_commit_release_pages);
 
 static void nfs_commit_release(void *calldata)
 {
-	struct nfs_write_data *data = calldata;
+	struct nfs_commit_data *data = calldata;
 
 	nfs_commit_release_pages(data);
 	nfs_commit_clear_lock(NFS_I(data->inode));
@@ -1534,7 +1537,7 @@ static void nfs_commit_release(void *calldata)
 }
 
 static const struct rpc_call_ops nfs_commit_ops = {
-	.rpc_call_prepare = nfs_write_prepare,
+	.rpc_call_prepare = nfs_commit_prepare,
 	.rpc_call_done = nfs_commit_done,
 	.rpc_release = nfs_commit_release,
 };
@@ -1753,6 +1756,13 @@ int __init nfs_init_writepagecache(void)
 	if (nfs_wdata_mempool == NULL)
 		return -ENOMEM;
 
+	nfs_cdata_cachep = kmem_cache_create("nfs_commit_data",
+					     sizeof(struct nfs_commit_data),
+					     0, SLAB_HWCACHE_ALIGN,
+					     NULL);
+	if (nfs_cdata_cachep == NULL)
+		return -ENOMEM;
+
 	nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
 						      nfs_wdata_cachep);
 	if (nfs_commit_mempool == NULL)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 52a1bdb..d5d68f3 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -552,8 +552,8 @@ extern int nfs_wb_page(struct inode *inode, struct page* page);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 extern int  nfs_commit_inode(struct inode *, int);
-extern struct nfs_write_data *nfs_commitdata_alloc(void);
-extern void nfs_commit_free(struct nfs_write_data *wdata);
+extern struct nfs_commit_data *nfs_commitdata_alloc(void);
+extern void nfs_commit_free(struct nfs_commit_data *data);
 #else
 static inline int
 nfs_commit_inode(struct inode *inode, int how)
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7ba3551..8fb036a 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -519,6 +519,24 @@ struct nfs_writeres {
 };
 
 /*
+ * Arguments to the commit call.
+ */
+struct nfs_commitargs {
+	struct nfs_fh		*fh;
+	__u64			offset;
+	__u32			count;
+	const u32		*bitmask;
+	struct nfs4_sequence_args	seq_args;
+};
+
+struct nfs_commitres {
+	struct nfs_fattr	*fattr;
+	struct nfs_writeverf	*verf;
+	const struct nfs_server *server;
+	struct nfs4_sequence_res	seq_res;
+};
+
+/*
  * Common arguments to the unlink call
  */
 struct nfs_removeargs {
@@ -1171,6 +1189,8 @@ struct nfs_read_data {
 	struct page		*page_array[NFS_PAGEVEC_SIZE];
 };
 
+struct nfs_direct_req;
+
 struct nfs_write_data {
 	struct rpc_task		task;
 	struct inode		*inode;
@@ -1186,7 +1206,6 @@ struct nfs_write_data {
 	struct nfs_writeres	res;		/* result struct */
 	struct pnfs_layout_segment *lseg;
 	struct nfs_client	*ds_clp;	/* pNFS data server */
-	int			ds_commit_index;
 	const struct rpc_call_ops *mds_ops;
 	int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
 #ifdef CONFIG_NFS_V4
@@ -1197,6 +1216,25 @@ struct nfs_write_data {
 	struct page		*page_array[NFS_PAGEVEC_SIZE];
 };
 
+struct nfs_commit_data {
+	struct rpc_task		task;
+	struct inode		*inode;
+	struct rpc_cred		*cred;
+	struct nfs_fattr	fattr;
+	struct nfs_writeverf	verf;
+	struct list_head	pages;		/* Coalesced requests we wish to flush */
+	struct list_head	list;		/* lists of struct nfs_write_data */
+	struct nfs_direct_req	*dreq;		/* O_DIRECT request */
+	struct nfs_commitargs	args;		/* argument struct */
+	struct nfs_commitres	res;		/* result struct */
+	struct nfs_open_context *context;
+	struct pnfs_layout_segment *lseg;
+	struct nfs_client	*ds_clp;	/* pNFS data server */
+	int			ds_commit_index;
+	const struct rpc_call_ops *mds_ops;
+	int (*commit_done_cb) (struct rpc_task *task, struct nfs_commit_data *data);
+};
+
 struct nfs_unlinkdata {
 	struct hlist_node list;
 	struct nfs_removeargs args;
@@ -1277,8 +1315,9 @@ struct nfs_rpc_ops {
 	void	(*write_setup)  (struct nfs_write_data *, struct rpc_message *);
 	void	(*write_rpc_prepare)(struct rpc_task *, struct nfs_write_data *);
 	int	(*write_done)  (struct rpc_task *, struct nfs_write_data *);
-	void	(*commit_setup) (struct nfs_write_data *, struct rpc_message *);
-	int	(*commit_done) (struct rpc_task *, struct nfs_write_data *);
+	void	(*commit_setup) (struct nfs_commit_data *, struct rpc_message *);
+	void	(*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *);
+	int	(*commit_done) (struct rpc_task *, struct nfs_commit_data *);
 	int	(*lock)(struct file *, int, struct file_lock *);
 	int	(*lock_check_bounds)(const struct file_lock *);
 	void	(*clear_acl_cache)(struct inode *);
-- 
cgit v0.10.2


From 31f6852a4c187c031456581b35e146c0d5bbdecd Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:40 -0400
Subject: NFS: dprintks in directio code were referencing task after put

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 5897dfe..fb7fbaa 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -357,15 +357,15 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		task = rpc_run_task(&task_setup_data);
 		if (IS_ERR(task))
 			break;
-		rpc_put_task(task);
 
 		dprintk("NFS: %5u initiated direct read call "
 			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
-				data->task.tk_pid,
+				task->tk_pid,
 				inode->i_sb->s_id,
 				(long long)NFS_FILEID(inode),
 				bytes,
 				(unsigned long long)data->args.offset);
+		rpc_put_task(task);
 
 		started += bytes;
 		user_addr += bytes;
@@ -784,15 +784,15 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		task = rpc_run_task(&task_setup_data);
 		if (IS_ERR(task))
 			break;
-		rpc_put_task(task);
 
 		dprintk("NFS: %5u initiated direct write call "
 			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
-				data->task.tk_pid,
+				task->tk_pid,
 				inode->i_sb->s_id,
 				(long long)NFS_FILEID(inode),
 				bytes,
 				(unsigned long long)data->args.offset);
+		rpc_put_task(task);
 
 		started += bytes;
 		user_addr += bytes;
-- 
cgit v0.10.2


From c5996c4efb95bbb80a25acc890357c9eae998eeb Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:41 -0400
Subject: NFS: reverse arg order in nfs_initiate_[read|write]

Make it consistent with nfs_initiate_commit.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29ab441..650127f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -296,7 +296,8 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
 
 struct nfs_pageio_descriptor;
 /* read.c */
-extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+extern int nfs_initiate_read(struct rpc_clnt *clnt,
+			     struct nfs_read_data *data,
 			     const struct rpc_call_ops *call_ops);
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
@@ -315,8 +316,8 @@ extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
 extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_writedata_release(struct nfs_write_data *wdata);
 extern void nfs_commit_free(struct nfs_commit_data *p);
-extern int nfs_initiate_write(struct nfs_write_data *data,
-			      struct rpc_clnt *clnt,
+extern int nfs_initiate_write(struct rpc_clnt *clnt,
+			      struct nfs_write_data *data,
 			      const struct rpc_call_ops *call_ops,
 			      int how);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 675ce3b..adbadcb 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -413,7 +413,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 	data->mds_offset = offset;
 
 	/* Perform an asynchronous read to ds */
-	status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
+	status = nfs_initiate_read(ds->ds_clp->cl_rpcclient, data,
 				   &filelayout_read_call_ops);
 	BUG_ON(status != 0);
 	return PNFS_ATTEMPTED;
@@ -460,7 +460,7 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
 	data->args.offset = filelayout_get_dserver_offset(lseg, offset);
 
 	/* Perform an asynchronous write */
-	status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
+	status = nfs_initiate_write(ds->ds_clp->cl_rpcclient, data,
 				    &filelayout_write_call_ops, sync);
 	BUG_ON(status != 0);
 	return PNFS_ATTEMPTED;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 0a4be28..4ddba67 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -169,7 +169,8 @@ static void nfs_readpage_release(struct nfs_page *req)
 	nfs_release_request(req);
 }
 
-int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+int nfs_initiate_read(struct rpc_clnt *clnt,
+		      struct nfs_read_data *data,
 		      const struct rpc_call_ops *call_ops)
 {
 	struct inode *inode = data->inode;
@@ -240,7 +241,7 @@ static int nfs_do_read(struct nfs_read_data *data,
 {
 	struct inode *inode = data->args.context->dentry->d_inode;
 
-	return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
+	return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops);
 }
 
 static int
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 54f7c0f..76735dd 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -836,8 +836,8 @@ static int flush_task_priority(int how)
 	return RPC_PRIORITY_NORMAL;
 }
 
-int nfs_initiate_write(struct nfs_write_data *data,
-		       struct rpc_clnt *clnt,
+int nfs_initiate_write(struct rpc_clnt *clnt,
+		       struct nfs_write_data *data,
 		       const struct rpc_call_ops *call_ops,
 		       int how)
 {
@@ -937,7 +937,7 @@ static int nfs_do_write(struct nfs_write_data *data,
 {
 	struct inode *inode = data->args.context->dentry->d_inode;
 
-	return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
+	return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how);
 }
 
 static int nfs_do_multiple_writes(struct list_head *head,
-- 
cgit v0.10.2


From cd12ae326f5c040f61d64233514609adabe84ab8 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:42 -0400
Subject: NFS: remove unnecessary casts of void pointers in nfs4filelayout.c

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index adbadcb..31afd81 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -191,7 +191,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
  */
 static void filelayout_read_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+	struct nfs_read_data *rdata = data;
 
 	rdata->read_done_cb = filelayout_read_done_cb;
 
@@ -205,7 +205,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
 
 static void filelayout_read_call_done(struct rpc_task *task, void *data)
 {
-	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+	struct nfs_read_data *rdata = data;
 
 	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
@@ -215,14 +215,14 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
 
 static void filelayout_read_count_stats(struct rpc_task *task, void *data)
 {
-	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+	struct nfs_read_data *rdata = data;
 
 	rpc_count_iostats(task, NFS_SERVER(rdata->inode)->client->cl_metrics);
 }
 
 static void filelayout_read_release(void *data)
 {
-	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+	struct nfs_read_data *rdata = data;
 
 	put_lseg(rdata->lseg);
 	rdata->mds_ops->rpc_release(data);
@@ -282,7 +282,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 
 static void filelayout_write_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+	struct nfs_write_data *wdata = data;
 
 	if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
 				&wdata->args.seq_args, &wdata->res.seq_res,
@@ -294,7 +294,7 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
 
 static void filelayout_write_call_done(struct rpc_task *task, void *data)
 {
-	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+	struct nfs_write_data *wdata = data;
 
 	/* Note this may cause RPC to be resent */
 	wdata->mds_ops->rpc_call_done(task, data);
@@ -302,14 +302,14 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data)
 
 static void filelayout_write_count_stats(struct rpc_task *task, void *data)
 {
-	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+	struct nfs_write_data *wdata = data;
 
 	rpc_count_iostats(task, NFS_SERVER(wdata->inode)->client->cl_metrics);
 }
 
 static void filelayout_write_release(void *data)
 {
-	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+	struct nfs_write_data *wdata = data;
 
 	put_lseg(wdata->lseg);
 	wdata->mds_ops->rpc_release(data);
-- 
cgit v0.10.2


From b5542849764aa56fd3f05c0041195b637b9d2ac2 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:43 -0400
Subject: NFS: use req_offset where appropriate

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 31afd81..c536328 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -776,8 +776,8 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 	    !nfs_generic_pg_test(pgio, prev, req))
 		return false;
 
-	p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
-	r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
+	p_stripe = (u64)req_offset(prev);
+	r_stripe = (u64)req_offset(req);
 	stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
 
 	do_div(p_stripe, stripe_unit);
@@ -896,8 +896,7 @@ filelayout_choose_commit_list(struct nfs_page *req,
 	 * to store the value calculated in filelayout_write_pagelist
 	 * and just use that here.
 	 */
-	j = nfs4_fl_calc_j_index(lseg,
-				 (loff_t)req->wb_index << PAGE_CACHE_SHIFT);
+	j = nfs4_fl_calc_j_index(lseg, req_offset(req));
 	i = select_bucket_index(fl, j);
 	buckets = FILELAYOUT_FROM_HDR(lseg->pls_layout)->commit_info.buckets;
 	list = &buckets[i].written;
-- 
cgit v0.10.2


From cd841605f7a721878d8a2d1362484723d8abf569 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:44 -0400
Subject: NFS: create common nfs_pgio_header for both read and write

In order to avoid duplicating all the data in nfs_read_data whenever we
split it up into multiple RPC calls (either due to a short read result
or due to rsize < PAGE_SIZE), we split out the bits that are the same
per RPC call into a separate "header" structure.

The goal this patch moves towards is to have a single header
refcounted by several rpc_data structures.  Thus, want to always refer
from rpc_data to the header, and not the other way.  This patch comes
close to that ideal, but the directio code currently needs some
special casing, isolated in the nfs_direct_[read_write]hdr_release()
functions.  This will be dealt with in a future patch.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 7f6a23f..7a48251 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -187,7 +187,6 @@ static void bl_end_io_read(struct bio *bio, int err)
 	struct parallel_io *par = bio->bi_private;
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;
 
 	do {
 		struct page *page = bvec->bv_page;
@@ -198,9 +197,12 @@ static void bl_end_io_read(struct bio *bio, int err)
 			SetPageUptodate(page);
 	} while (bvec >= bio->bi_io_vec);
 	if (!uptodate) {
-		if (!rdata->pnfs_error)
-			rdata->pnfs_error = -EIO;
-		pnfs_set_lo_fail(rdata->lseg);
+		struct nfs_read_data *rdata = par->data;
+		struct nfs_pgio_header *header = rdata->header;
+
+		if (!header->pnfs_error)
+			header->pnfs_error = -EIO;
+		pnfs_set_lo_fail(header->lseg);
 	}
 	bio_put(bio);
 	put_parallel(par);
@@ -221,7 +223,7 @@ bl_end_par_io_read(void *data, int unused)
 {
 	struct nfs_read_data *rdata = data;
 
-	rdata->task.tk_status = rdata->pnfs_error;
+	rdata->task.tk_status = rdata->header->pnfs_error;
 	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
 	schedule_work(&rdata->task.u.tk_work);
 }
@@ -229,6 +231,7 @@ bl_end_par_io_read(void *data, int unused)
 static enum pnfs_try_status
 bl_read_pagelist(struct nfs_read_data *rdata)
 {
+	struct nfs_pgio_header *header = rdata->header;
 	int i, hole;
 	struct bio *bio = NULL;
 	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
@@ -256,10 +259,10 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 			bl_put_extent(cow_read);
 			bio = bl_submit_bio(READ, bio);
 			/* Get the next one */
-			be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
+			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
 					     isect, &cow_read);
 			if (!be) {
-				rdata->pnfs_error = -EIO;
+				header->pnfs_error = -EIO;
 				goto out;
 			}
 			extent_length = be->be_length -
@@ -286,7 +289,7 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 						 isect, pages[i], be_read,
 						 bl_end_io_read, par);
 			if (IS_ERR(bio)) {
-				rdata->pnfs_error = PTR_ERR(bio);
+				header->pnfs_error = PTR_ERR(bio);
 				bio = NULL;
 				goto out;
 			}
@@ -294,9 +297,9 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 		isect += PAGE_CACHE_SECTORS;
 		extent_length -= PAGE_CACHE_SECTORS;
 	}
-	if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
+	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
 		rdata->res.eof = 1;
-		rdata->res.count = rdata->inode->i_size - f_offset;
+		rdata->res.count = header->inode->i_size - f_offset;
 	} else {
 		rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
 	}
@@ -345,7 +348,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
 	struct parallel_io *par = bio->bi_private;
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
 
 	do {
 		struct page *page = bvec->bv_page;
@@ -358,9 +360,12 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
 	} while (bvec >= bio->bi_io_vec);
 
 	if (unlikely(!uptodate)) {
-		if (!wdata->pnfs_error)
-			wdata->pnfs_error = -EIO;
-		pnfs_set_lo_fail(wdata->lseg);
+		struct nfs_write_data *data = par->data;
+		struct nfs_pgio_header *header = data->header;
+
+		if (!header->pnfs_error)
+			header->pnfs_error = -EIO;
+		pnfs_set_lo_fail(header->lseg);
 	}
 	bio_put(bio);
 	put_parallel(par);
@@ -370,12 +375,13 @@ static void bl_end_io_write(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
+	struct nfs_write_data *data = par->data;
+	struct nfs_pgio_header *header = data->header;
 
 	if (!uptodate) {
-		if (!wdata->pnfs_error)
-			wdata->pnfs_error = -EIO;
-		pnfs_set_lo_fail(wdata->lseg);
+		if (!header->pnfs_error)
+			header->pnfs_error = -EIO;
+		pnfs_set_lo_fail(header->lseg);
 	}
 	bio_put(bio);
 	put_parallel(par);
@@ -391,9 +397,9 @@ static void bl_write_cleanup(struct work_struct *work)
 	dprintk("%s enter\n", __func__);
 	task = container_of(work, struct rpc_task, u.tk_work);
 	wdata = container_of(task, struct nfs_write_data, task);
-	if (likely(!wdata->pnfs_error)) {
+	if (likely(!wdata->header->pnfs_error)) {
 		/* Marks for LAYOUTCOMMIT */
-		mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+		mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg),
 				     wdata->args.offset, wdata->args.count);
 	}
 	pnfs_ld_write_done(wdata);
@@ -404,12 +410,12 @@ static void bl_end_par_io_write(void *data, int num_se)
 {
 	struct nfs_write_data *wdata = data;
 
-	if (unlikely(wdata->pnfs_error)) {
-		bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
+	if (unlikely(wdata->header->pnfs_error)) {
+		bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval,
 					num_se);
 	}
 
-	wdata->task.tk_status = wdata->pnfs_error;
+	wdata->task.tk_status = wdata->header->pnfs_error;
 	wdata->verf.committed = NFS_FILE_SYNC;
 	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
 	schedule_work(&wdata->task.u.tk_work);
@@ -540,6 +546,7 @@ check_page:
 static enum pnfs_try_status
 bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 {
+	struct nfs_pgio_header *header = wdata->header;
 	int i, ret, npg_zero, pg_index, last = 0;
 	struct bio *bio = NULL;
 	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
@@ -552,7 +559,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	pgoff_t index;
 	u64 temp;
 	int npg_per_block =
-	    NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+	    NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
 
 	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
 	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
@@ -566,7 +573,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	/* At this point, have to be more careful with error handling */
 
 	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
-	be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
+	be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
 	if (!be || !is_writable(be, isect)) {
 		dprintk("%s no matching extents!\n", __func__);
 		goto out_mds;
@@ -597,10 +604,10 @@ fill_invalid_ext:
 			dprintk("%s zero %dth page: index %lu isect %llu\n",
 				__func__, npg_zero, index,
 				(unsigned long long)isect);
-			page = bl_find_get_zeroing_page(wdata->inode, index,
+			page = bl_find_get_zeroing_page(header->inode, index,
 							cow_read);
 			if (unlikely(IS_ERR(page))) {
-				wdata->pnfs_error = PTR_ERR(page);
+				header->pnfs_error = PTR_ERR(page);
 				goto out;
 			} else if (page == NULL)
 				goto next_page;
@@ -612,7 +619,7 @@ fill_invalid_ext:
 					__func__, ret);
 				end_page_writeback(page);
 				page_cache_release(page);
-				wdata->pnfs_error = ret;
+				header->pnfs_error = ret;
 				goto out;
 			}
 			if (likely(!bl_push_one_short_extent(be->be_inval)))
@@ -620,11 +627,11 @@ fill_invalid_ext:
 			else {
 				end_page_writeback(page);
 				page_cache_release(page);
-				wdata->pnfs_error = -ENOMEM;
+				header->pnfs_error = -ENOMEM;
 				goto out;
 			}
 			/* FIXME: This should be done in bi_end_io */
-			mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+			mark_extents_written(BLK_LSEG2EXT(header->lseg),
 					     page->index << PAGE_CACHE_SHIFT,
 					     PAGE_CACHE_SIZE);
 
@@ -632,7 +639,7 @@ fill_invalid_ext:
 						 isect, page, be,
 						 bl_end_io_write_zero, par);
 			if (IS_ERR(bio)) {
-				wdata->pnfs_error = PTR_ERR(bio);
+				header->pnfs_error = PTR_ERR(bio);
 				bio = NULL;
 				goto out;
 			}
@@ -653,10 +660,10 @@ next_page:
 			bl_put_extent(be);
 			bio = bl_submit_bio(WRITE, bio);
 			/* Get the next one */
-			be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
+			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
 					     isect, NULL);
 			if (!be || !is_writable(be, isect)) {
-				wdata->pnfs_error = -EINVAL;
+				header->pnfs_error = -EINVAL;
 				goto out;
 			}
 			if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
@@ -664,7 +671,7 @@ next_page:
 								be->be_inval)))
 					par->bse_count++;
 				else {
-					wdata->pnfs_error = -ENOMEM;
+					header->pnfs_error = -ENOMEM;
 					goto out;
 				}
 			}
@@ -677,7 +684,7 @@ next_page:
 			if (unlikely(ret)) {
 				dprintk("%s bl_mark_sectors_init fail %d\n",
 					__func__, ret);
-				wdata->pnfs_error = ret;
+				header->pnfs_error = ret;
 				goto out;
 			}
 		}
@@ -685,7 +692,7 @@ next_page:
 					 isect, pages[i], be,
 					 bl_end_io_write, par);
 		if (IS_ERR(bio)) {
-			wdata->pnfs_error = PTR_ERR(bio);
+			header->pnfs_error = PTR_ERR(bio);
 			bio = NULL;
 			goto out;
 		}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index fb7fbaa..56176af 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -242,7 +242,7 @@ static void nfs_direct_read_release(void *calldata)
 {
 
 	struct nfs_read_data *data = calldata;
-	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+	struct nfs_direct_req *dreq = (struct nfs_direct_req *)data->header->req;
 	int status = data->task.tk_status;
 
 	spin_lock(&dreq->lock);
@@ -269,6 +269,15 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 	.rpc_release = nfs_direct_read_release,
 };
 
+static void nfs_direct_readhdr_release(struct nfs_read_header *rhdr)
+{
+	struct nfs_read_data *data = &rhdr->rpc_data;
+
+	if (data->pagevec != data->page_array)
+		kfree(data->pagevec);
+	nfs_readhdr_free(&rhdr->header);
+}
+
 /*
  * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
  * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
@@ -301,6 +310,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 	ssize_t started = 0;
 
 	do {
+		struct nfs_read_header *rhdr;
 		struct nfs_read_data *data;
 		size_t bytes;
 
@@ -308,23 +318,24 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		bytes = min(rsize,count);
 
 		result = -ENOMEM;
-		data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes));
-		if (unlikely(!data))
+		rhdr = nfs_readhdr_alloc(nfs_page_array_len(pgbase, bytes));
+		if (unlikely(!rhdr))
 			break;
+		data = &rhdr->rpc_data;
 
 		down_read(&current->mm->mmap_sem);
 		result = get_user_pages(current, current->mm, user_addr,
 					data->npages, 1, 0, data->pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
 		if (result < 0) {
-			nfs_readdata_free(data);
+			nfs_direct_readhdr_release(rhdr);
 			break;
 		}
 		if ((unsigned)result < data->npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
 				nfs_direct_release_pages(data->pagevec, result);
-				nfs_readdata_free(data);
+				nfs_direct_readhdr_release(rhdr);
 				break;
 			}
 			bytes -= pgbase;
@@ -333,9 +344,9 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 
 		get_dreq(dreq);
 
-		data->req = (struct nfs_page *) dreq;
-		data->inode = inode;
-		data->cred = msg.rpc_cred;
+		rhdr->header.req = (struct nfs_page *) dreq;
+		rhdr->header.inode = inode;
+		rhdr->header.cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = get_nfs_open_context(ctx);
 		data->args.lock_context = dreq->l_ctx;
@@ -447,13 +458,23 @@ out:
 	return result;
 }
 
+static void nfs_direct_writehdr_release(struct nfs_write_header *whdr)
+{
+	struct nfs_write_data *data = &whdr->rpc_data;
+
+	if (data->pagevec != data->page_array)
+		kfree(data->pagevec);
+	nfs_writehdr_free(&whdr->header);
+}
+
 static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 {
 	while (!list_empty(&dreq->rewrite_list)) {
-		struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
-		list_del(&data->pages);
-		nfs_direct_release_pages(data->pagevec, data->npages);
-		nfs_writedata_free(data);
+		struct nfs_pgio_header *hdr = list_entry(dreq->rewrite_list.next, struct nfs_pgio_header, pages);
+		struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);
+		list_del(&hdr->pages);
+		nfs_direct_release_pages(whdr->rpc_data.pagevec, whdr->rpc_data.npages);
+		nfs_direct_writehdr_release(whdr);
 	}
 }
 
@@ -463,6 +484,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 	struct inode *inode = dreq->inode;
 	struct list_head *p;
 	struct nfs_write_data *data;
+	struct nfs_pgio_header *hdr;
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_cred = dreq->ctx->cred,
@@ -479,7 +501,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 	get_dreq(dreq);
 
 	list_for_each(p, &dreq->rewrite_list) {
-		data = list_entry(p, struct nfs_write_data, pages);
+		hdr = list_entry(p, struct nfs_pgio_header, pages);
+		data = &(container_of(hdr, struct nfs_write_header, header))->rpc_data;
 
 		get_dreq(dreq);
 
@@ -652,7 +675,8 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 static void nfs_direct_write_release(void *calldata)
 {
 	struct nfs_write_data *data = calldata;
-	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+	struct nfs_pgio_header *hdr = data->header;
+	struct nfs_direct_req *dreq = (struct nfs_direct_req *) hdr->req;
 	int status = data->task.tk_status;
 
 	spin_lock(&dreq->lock);
@@ -684,7 +708,7 @@ out_unlock:
 	spin_unlock(&dreq->lock);
 
 	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, data->inode);
+		nfs_direct_write_complete(dreq, hdr->inode);
 }
 
 static const struct rpc_call_ops nfs_write_direct_ops = {
@@ -725,6 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 	ssize_t started = 0;
 
 	do {
+		struct nfs_write_header *whdr;
 		struct nfs_write_data *data;
 		size_t bytes;
 
@@ -732,23 +757,25 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		bytes = min(wsize,count);
 
 		result = -ENOMEM;
-		data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes));
-		if (unlikely(!data))
+		whdr = nfs_writehdr_alloc(nfs_page_array_len(pgbase, bytes));
+		if (unlikely(!whdr))
 			break;
 
+		data = &whdr->rpc_data;
+
 		down_read(&current->mm->mmap_sem);
 		result = get_user_pages(current, current->mm, user_addr,
 					data->npages, 0, 0, data->pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
 		if (result < 0) {
-			nfs_writedata_free(data);
+			nfs_direct_writehdr_release(whdr);
 			break;
 		}
 		if ((unsigned)result < data->npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
 				nfs_direct_release_pages(data->pagevec, result);
-				nfs_writedata_free(data);
+				nfs_direct_writehdr_release(whdr);
 				break;
 			}
 			bytes -= pgbase;
@@ -757,11 +784,11 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 
 		get_dreq(dreq);
 
-		list_move_tail(&data->pages, &dreq->rewrite_list);
+		list_move_tail(&whdr->header.pages, &dreq->rewrite_list);
 
-		data->req = (struct nfs_page *) dreq;
-		data->inode = inode;
-		data->cred = msg.rpc_cred;
+		whdr->header.req = (struct nfs_page *) dreq;
+		whdr->header.inode = inode;
+		whdr->header.cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.lock_context = dreq->l_ctx;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 650127f..7dc9be1 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -296,6 +296,8 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
 
 struct nfs_pageio_descriptor;
 /* read.c */
+extern struct nfs_read_header *nfs_readhdr_alloc(unsigned int npages);
+extern void nfs_readhdr_free(struct nfs_pgio_header *hdr);
 extern int nfs_initiate_read(struct rpc_clnt *clnt,
 			     struct nfs_read_data *data,
 			     const struct rpc_call_ops *call_ops);
@@ -309,6 +311,8 @@ extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_readdata_release(struct nfs_read_data *rdata);
 
 /* write.c */
+extern struct nfs_write_header *nfs_writehdr_alloc(unsigned int npages);
+extern void nfs_writehdr_free(struct nfs_pgio_header *hdr);
 extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
 		struct list_head *head);
 extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index b1daca7..56dcefc 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -811,11 +811,13 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
-	if (nfs3_async_handle_jukebox(task, data->inode))
+	struct inode *inode = data->header->inode;
+
+	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 
-	nfs_invalidate_atime(data->inode);
-	nfs_refresh_inode(data->inode, &data->fattr);
+	nfs_invalidate_atime(inode);
+	nfs_refresh_inode(inode, &data->fattr);
 	return 0;
 }
 
@@ -831,10 +833,12 @@ static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_da
 
 static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
-	if (nfs3_async_handle_jukebox(task, data->inode))
+	struct inode *inode = data->header->inode;
+
+	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index c536328..ad1d680 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -148,6 +148,7 @@ wait_on_recovery:
 static int filelayout_read_done_cb(struct rpc_task *task,
 				struct nfs_read_data *data)
 {
+	struct nfs_pgio_header *hdr = data->header;
 	int reset = 0;
 
 	dprintk("%s DS read\n", __func__);
@@ -157,7 +158,7 @@ static int filelayout_read_done_cb(struct rpc_task *task,
 		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
 			__func__, data->ds_clp, data->ds_clp->cl_session);
 		if (reset) {
-			pnfs_set_lo_fail(data->lseg);
+			pnfs_set_lo_fail(hdr->lseg);
 			nfs4_reset_read(task, data);
 		}
 		rpc_restart_call_prepare(task);
@@ -175,13 +176,15 @@ static int filelayout_read_done_cb(struct rpc_task *task,
 static void
 filelayout_set_layoutcommit(struct nfs_write_data *wdata)
 {
-	if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds ||
+	struct nfs_pgio_header *hdr = wdata->header;
+
+	if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
 	    wdata->res.verf->committed == NFS_FILE_SYNC)
 		return;
 
 	pnfs_set_layoutcommit(wdata);
-	dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
-		(unsigned long) NFS_I(wdata->inode)->layout->plh_lwb);
+	dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
+		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
 
 /*
@@ -210,27 +213,28 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
 	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
 	/* Note this may cause RPC to be resent */
-	rdata->mds_ops->rpc_call_done(task, data);
+	rdata->header->mds_ops->rpc_call_done(task, data);
 }
 
 static void filelayout_read_count_stats(struct rpc_task *task, void *data)
 {
 	struct nfs_read_data *rdata = data;
 
-	rpc_count_iostats(task, NFS_SERVER(rdata->inode)->client->cl_metrics);
+	rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics);
 }
 
 static void filelayout_read_release(void *data)
 {
 	struct nfs_read_data *rdata = data;
 
-	put_lseg(rdata->lseg);
-	rdata->mds_ops->rpc_release(data);
+	put_lseg(rdata->header->lseg);
+	rdata->header->mds_ops->rpc_release(data);
 }
 
 static int filelayout_write_done_cb(struct rpc_task *task,
 				struct nfs_write_data *data)
 {
+	struct nfs_pgio_header *hdr = data->header;
 	int reset = 0;
 
 	if (filelayout_async_handle_error(task, data->args.context->state,
@@ -238,7 +242,7 @@ static int filelayout_write_done_cb(struct rpc_task *task,
 		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
 			__func__, data->ds_clp, data->ds_clp->cl_session);
 		if (reset) {
-			pnfs_set_lo_fail(data->lseg);
+			pnfs_set_lo_fail(hdr->lseg);
 			nfs4_reset_write(task, data);
 		}
 		rpc_restart_call_prepare(task);
@@ -297,22 +301,22 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data)
 	struct nfs_write_data *wdata = data;
 
 	/* Note this may cause RPC to be resent */
-	wdata->mds_ops->rpc_call_done(task, data);
+	wdata->header->mds_ops->rpc_call_done(task, data);
 }
 
 static void filelayout_write_count_stats(struct rpc_task *task, void *data)
 {
 	struct nfs_write_data *wdata = data;
 
-	rpc_count_iostats(task, NFS_SERVER(wdata->inode)->client->cl_metrics);
+	rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics);
 }
 
 static void filelayout_write_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
 
-	put_lseg(wdata->lseg);
-	wdata->mds_ops->rpc_release(data);
+	put_lseg(wdata->header->lseg);
+	wdata->header->mds_ops->rpc_release(data);
 }
 
 static void filelayout_commit_prepare(struct rpc_task *task, void *data)
@@ -377,7 +381,8 @@ static const struct rpc_call_ops filelayout_commit_call_ops = {
 static enum pnfs_try_status
 filelayout_read_pagelist(struct nfs_read_data *data)
 {
-	struct pnfs_layout_segment *lseg = data->lseg;
+	struct nfs_pgio_header *hdr = data->header;
+	struct pnfs_layout_segment *lseg = hdr->lseg;
 	struct nfs4_pnfs_ds *ds;
 	loff_t offset = data->args.offset;
 	u32 j, idx;
@@ -385,7 +390,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 	int status;
 
 	dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
-		__func__, data->inode->i_ino,
+		__func__, hdr->inode->i_ino,
 		data->args.pgbase, (size_t)data->args.count, offset);
 
 	if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags))
@@ -423,7 +428,8 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 static enum pnfs_try_status
 filelayout_write_pagelist(struct nfs_write_data *data, int sync)
 {
-	struct pnfs_layout_segment *lseg = data->lseg;
+	struct nfs_pgio_header *hdr = data->header;
+	struct pnfs_layout_segment *lseg = hdr->lseg;
 	struct nfs4_pnfs_ds *ds;
 	loff_t offset = data->args.offset;
 	u32 j, idx;
@@ -445,7 +451,7 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
 		return PNFS_NOT_ATTEMPTED;
 	}
 	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__,
-		data->inode->i_ino, sync, (size_t) data->args.count, offset,
+		hdr->inode->i_ino, sync, (size_t) data->args.count, offset,
 		ds->ds_remotestr);
 
 	data->write_done_cb = filelayout_write_done_cb;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cc04b6e..5375862 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3336,12 +3336,12 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 void __nfs4_read_done_cb(struct nfs_read_data *data)
 {
-	nfs_invalidate_atime(data->inode);
+	nfs_invalidate_atime(data->header->inode);
 }
 
 static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
 {
-	struct nfs_server *server = NFS_SERVER(data->inode);
+	struct nfs_server *server = NFS_SERVER(data->header->inode);
 
 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
@@ -3376,7 +3376,7 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
 
 static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
 {
-	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+	if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
 				&data->args.seq_args,
 				&data->res.seq_res,
 				task))
@@ -3387,22 +3387,25 @@ static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_da
 /* Reset the the nfs_read_data to send the read to the MDS. */
 void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
 {
+	struct nfs_pgio_header *hdr = data->header;
+	struct inode *inode = hdr->inode;
+
 	dprintk("%s Reset task for i/o through\n", __func__);
-	put_lseg(data->lseg);
-	data->lseg = NULL;
+	put_lseg(hdr->lseg);
+	hdr->lseg = NULL;
+	data->ds_clp = NULL;
 	/* offsets will differ in the dense stripe case */
 	data->args.offset = data->mds_offset;
-	data->ds_clp = NULL;
-	data->args.fh     = NFS_FH(data->inode);
+	data->args.fh     = NFS_FH(inode);
 	data->read_done_cb = nfs4_read_done_cb;
-	task->tk_ops = data->mds_ops;
-	rpc_task_reset_client(task, NFS_CLIENT(data->inode));
+	task->tk_ops = hdr->mds_ops;
+	rpc_task_reset_client(task, NFS_CLIENT(inode));
 }
 EXPORT_SYMBOL_GPL(nfs4_reset_read);
 
 static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
 {
-	struct inode *inode = data->inode;
+	struct inode *inode = data->header->inode;
 	
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
@@ -3426,25 +3429,28 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 /* Reset the the nfs_write_data to send the write to the MDS. */
 void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
 {
+	struct nfs_pgio_header *hdr = data->header;
+	struct inode *inode = hdr->inode;
+
 	dprintk("%s Reset task for i/o through\n", __func__);
-	put_lseg(data->lseg);
-	data->lseg          = NULL;
-	data->ds_clp        = NULL;
+	put_lseg(hdr->lseg);
+	hdr->lseg        = NULL;
+	data->ds_clp     = NULL;
 	data->write_done_cb = nfs4_write_done_cb;
-	data->args.fh       = NFS_FH(data->inode);
+	data->args.fh       = NFS_FH(inode);
 	data->args.bitmask  = data->res.server->cache_consistency_bitmask;
 	data->args.offset   = data->mds_offset;
 	data->res.fattr     = &data->fattr;
-	task->tk_ops        = data->mds_ops;
-	rpc_task_reset_client(task, NFS_CLIENT(data->inode));
+	task->tk_ops        = hdr->mds_ops;
+	rpc_task_reset_client(task, NFS_CLIENT(inode));
 }
 EXPORT_SYMBOL_GPL(nfs4_reset_write);
 
 static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-	struct nfs_server *server = NFS_SERVER(data->inode);
+	struct nfs_server *server = NFS_SERVER(data->header->inode);
 
-	if (data->lseg) {
+	if (data->header->lseg) {
 		data->args.bitmask = NULL;
 		data->res.fattr = NULL;
 	} else
@@ -3460,7 +3466,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 
 static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
 {
-	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+	if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
 				&data->args.seq_args,
 				&data->res.seq_res,
 				task))
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 4bff4a3..fbf4874 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -440,11 +440,12 @@ static void _read_done(struct ore_io_state *ios, void *private)
 
 int objio_read_pagelist(struct nfs_read_data *rdata)
 {
+	struct nfs_pgio_header *hdr = rdata->header;
 	struct objio_state *objios;
 	int ret;
 
-	ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
-			rdata->lseg, rdata->args.pages, rdata->args.pgbase,
+	ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
+			hdr->lseg, rdata->args.pages, rdata->args.pgbase,
 			rdata->args.offset, rdata->args.count, rdata,
 			GFP_KERNEL, &objios);
 	if (unlikely(ret))
@@ -483,12 +484,12 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
 {
 	struct objio_state *objios = priv;
 	struct nfs_write_data *wdata = objios->oir.rpcdata;
+	struct address_space *mapping = wdata->header->inode->i_mapping;
 	pgoff_t index = offset / PAGE_SIZE;
-	struct page *page = find_get_page(wdata->inode->i_mapping, index);
+	struct page *page = find_get_page(mapping, index);
 
 	if (!page) {
-		page = find_or_create_page(wdata->inode->i_mapping,
-						index, GFP_NOFS);
+		page = find_or_create_page(mapping, index, GFP_NOFS);
 		if (unlikely(!page)) {
 			dprintk("%s: grab_cache_page Failed index=0x%lx\n",
 				__func__, index);
@@ -518,11 +519,12 @@ static const struct _ore_r4w_op _r4w_op = {
 
 int objio_write_pagelist(struct nfs_write_data *wdata, int how)
 {
+	struct nfs_pgio_header *hdr = wdata->header;
 	struct objio_state *objios;
 	int ret;
 
-	ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false,
-			wdata->lseg, wdata->args.pages, wdata->args.pgbase,
+	ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
+			hdr->lseg, wdata->args.pages, wdata->args.pgbase,
 			wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
 			&objios);
 	if (unlikely(ret))
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 595c5fc..8746135 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -258,7 +258,7 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 	if (status >= 0)
 		rdata->res.count = status;
 	else
-		rdata->pnfs_error = status;
+		rdata->header->pnfs_error = status;
 	objlayout_iodone(oir);
 	/* must not use oir after this point */
 
@@ -279,12 +279,14 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 enum pnfs_try_status
 objlayout_read_pagelist(struct nfs_read_data *rdata)
 {
+	struct nfs_pgio_header *hdr = rdata->header;
+	struct inode *inode = hdr->inode;
 	loff_t offset = rdata->args.offset;
 	size_t count = rdata->args.count;
 	int err;
 	loff_t eof;
 
-	eof = i_size_read(rdata->inode);
+	eof = i_size_read(inode);
 	if (unlikely(offset + count > eof)) {
 		if (offset >= eof) {
 			err = 0;
@@ -297,17 +299,17 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
 	}
 
 	rdata->res.eof = (offset + count) >= eof;
-	_fix_verify_io_params(rdata->lseg, &rdata->args.pages,
+	_fix_verify_io_params(hdr->lseg, &rdata->args.pages,
 			      &rdata->args.pgbase,
 			      rdata->args.offset, rdata->args.count);
 
 	dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
-		__func__, rdata->inode->i_ino, offset, count, rdata->res.eof);
+		__func__, inode->i_ino, offset, count, rdata->res.eof);
 
 	err = objio_read_pagelist(rdata);
  out:
 	if (unlikely(err)) {
-		rdata->pnfs_error = err;
+		hdr->pnfs_error = err;
 		dprintk("%s: Returned Error %d\n", __func__, err);
 		return PNFS_NOT_ATTEMPTED;
 	}
@@ -340,7 +342,7 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 		wdata->res.count = status;
 		wdata->verf.committed = oir->committed;
 	} else {
-		wdata->pnfs_error = status;
+		wdata->header->pnfs_error = status;
 	}
 	objlayout_iodone(oir);
 	/* must not use oir after this point */
@@ -363,15 +365,16 @@ enum pnfs_try_status
 objlayout_write_pagelist(struct nfs_write_data *wdata,
 			 int how)
 {
+	struct nfs_pgio_header *hdr = wdata->header;
 	int err;
 
-	_fix_verify_io_params(wdata->lseg, &wdata->args.pages,
+	_fix_verify_io_params(hdr->lseg, &wdata->args.pages,
 			      &wdata->args.pgbase,
 			      wdata->args.offset, wdata->args.count);
 
 	err = objio_write_pagelist(wdata, how);
 	if (unlikely(err)) {
-		wdata->pnfs_error = err;
+		hdr->pnfs_error = err;
 		dprintk("%s: Returned Error %d\n", __func__, err);
 		return PNFS_NOT_ATTEMPTED;
 	}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 9c4d14a..d705da4 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1191,13 +1191,15 @@ static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *
 
 static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
 {
-	dprintk("pnfs write error = %d\n", data->pnfs_error);
-	if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
+	struct nfs_pgio_header *hdr = data->header;
+
+	dprintk("pnfs write error = %d\n", hdr->pnfs_error);
+	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
 	    PNFS_LAYOUTRET_ON_ERROR) {
-		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(data->inode)->flags);
-		pnfs_return_layout(data->inode);
+		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
+		pnfs_return_layout(hdr->inode);
 	}
-	data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);
+	data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, &hdr->pages);
 }
 
 /*
@@ -1205,13 +1207,15 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
  */
 void pnfs_ld_write_done(struct nfs_write_data *data)
 {
-	if (likely(!data->pnfs_error)) {
+	struct nfs_pgio_header *hdr = data->header;
+
+	if (!hdr->pnfs_error) {
 		pnfs_set_layoutcommit(data);
-		data->mds_ops->rpc_call_done(&data->task, data);
+		hdr->mds_ops->rpc_call_done(&data->task, data);
 	} else
 		pnfs_ld_handle_write_error(data);
-	put_lseg(data->lseg);
-	data->mds_ops->rpc_release(data);
+	put_lseg(hdr->lseg);
+	hdr->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
 
@@ -1219,12 +1223,14 @@ static void
 pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
 		struct nfs_write_data *data)
 {
-	list_splice_tail_init(&data->pages, &desc->pg_list);
-	if (data->req && list_empty(&data->req->wb_list))
-		nfs_list_add_request(data->req, &desc->pg_list);
+	struct nfs_pgio_header *hdr = data->header;
+
+	list_splice_tail_init(&hdr->pages, &desc->pg_list);
+	if (hdr->req && list_empty(&hdr->req->wb_list))
+		nfs_list_add_request(hdr->req, &desc->pg_list);
 	nfs_pageio_reset_write_mds(desc);
 	desc->pg_recoalesce = 1;
-	put_lseg(data->lseg);
+	put_lseg(hdr->lseg);
 	nfs_writedata_release(data);
 }
 
@@ -1234,20 +1240,21 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
 			struct pnfs_layout_segment *lseg,
 			int how)
 {
-	struct inode *inode = wdata->inode;
+	struct nfs_pgio_header *hdr = wdata->header;
+	struct inode *inode = hdr->inode;
 	enum pnfs_try_status trypnfs;
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
-	wdata->mds_ops = call_ops;
-	wdata->lseg = get_lseg(lseg);
+	hdr->mds_ops = call_ops;
+	hdr->lseg = get_lseg(lseg);
 
 	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
 		inode->i_ino, wdata->args.count, wdata->args.offset, how);
 
 	trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
 	if (trypnfs == PNFS_NOT_ATTEMPTED) {
-		put_lseg(wdata->lseg);
-		wdata->lseg = NULL;
+		put_lseg(hdr->lseg);
+		hdr->lseg = NULL;
 	} else
 		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
 
@@ -1318,13 +1325,15 @@ static int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *h
 
 static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
 {
-	dprintk("pnfs read error = %d\n", data->pnfs_error);
-	if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
+	struct nfs_pgio_header *hdr = data->header;
+
+	dprintk("pnfs read error = %d\n", hdr->pnfs_error);
+	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
 	    PNFS_LAYOUTRET_ON_ERROR) {
-		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(data->inode)->flags);
-		pnfs_return_layout(data->inode);
+		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
+		pnfs_return_layout(hdr->inode);
 	}
-	data->task.tk_status = pnfs_read_done_resend_to_mds(data->inode, &data->pages);
+	data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, &hdr->pages);
 }
 
 /*
@@ -1332,13 +1341,15 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
  */
 void pnfs_ld_read_done(struct nfs_read_data *data)
 {
-	if (likely(!data->pnfs_error)) {
+	struct nfs_pgio_header *hdr = data->header;
+
+	if (likely(!hdr->pnfs_error)) {
 		__nfs4_read_done_cb(data);
-		data->mds_ops->rpc_call_done(&data->task, data);
+		hdr->mds_ops->rpc_call_done(&data->task, data);
 	} else
 		pnfs_ld_handle_read_error(data);
-	put_lseg(data->lseg);
-	data->mds_ops->rpc_release(data);
+	put_lseg(hdr->lseg);
+	hdr->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
 
@@ -1346,9 +1357,11 @@ static void
 pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
 		struct nfs_read_data *data)
 {
-	list_splice_tail_init(&data->pages, &desc->pg_list);
-	if (data->req && list_empty(&data->req->wb_list))
-		nfs_list_add_request(data->req, &desc->pg_list);
+	struct nfs_pgio_header *hdr = data->header;
+
+	list_splice_tail_init(&hdr->pages, &desc->pg_list);
+	if (hdr->req && list_empty(&hdr->req->wb_list))
+		nfs_list_add_request(hdr->req, &desc->pg_list);
 	nfs_pageio_reset_read_mds(desc);
 	desc->pg_recoalesce = 1;
 	nfs_readdata_release(data);
@@ -1362,20 +1375,21 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
 		       const struct rpc_call_ops *call_ops,
 		       struct pnfs_layout_segment *lseg)
 {
-	struct inode *inode = rdata->inode;
+	struct nfs_pgio_header *hdr = rdata->header;
+	struct inode *inode = hdr->inode;
 	struct nfs_server *nfss = NFS_SERVER(inode);
 	enum pnfs_try_status trypnfs;
 
-	rdata->mds_ops = call_ops;
-	rdata->lseg = get_lseg(lseg);
+	hdr->mds_ops = call_ops;
+	hdr->lseg = get_lseg(lseg);
 
 	dprintk("%s: Reading ino:%lu %u@%llu\n",
 		__func__, inode->i_ino, rdata->args.count, rdata->args.offset);
 
 	trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
 	if (trypnfs == PNFS_NOT_ATTEMPTED) {
-		put_lseg(rdata->lseg);
-		rdata->lseg = NULL;
+		put_lseg(hdr->lseg);
+		hdr->lseg = NULL;
 	} else {
 		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
 	}
@@ -1450,30 +1464,32 @@ EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
 void
 pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 {
-	struct nfs_inode *nfsi = NFS_I(wdata->inode);
+	struct nfs_pgio_header *hdr = wdata->header;
+	struct inode *inode = hdr->inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t end_pos = wdata->mds_offset + wdata->res.count;
 	bool mark_as_dirty = false;
 
-	spin_lock(&nfsi->vfs_inode.i_lock);
+	spin_lock(&inode->i_lock);
 	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
 		mark_as_dirty = true;
 		dprintk("%s: Set layoutcommit for inode %lu ",
-			__func__, wdata->inode->i_ino);
+			__func__, inode->i_ino);
 	}
-	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) {
+	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
 		/* references matched in nfs4_layoutcommit_release */
-		get_lseg(wdata->lseg);
+		get_lseg(hdr->lseg);
 	}
 	if (end_pos > nfsi->layout->plh_lwb)
 		nfsi->layout->plh_lwb = end_pos;
-	spin_unlock(&nfsi->vfs_inode.i_lock);
+	spin_unlock(&inode->i_lock);
 	dprintk("%s: lseg %p end_pos %llu\n",
-		__func__, wdata->lseg, nfsi->layout->plh_lwb);
+		__func__, hdr->lseg, nfsi->layout->plh_lwb);
 
 	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
 	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
 	if (mark_as_dirty)
-		mark_inode_dirty_sync(wdata->inode);
+		mark_inode_dirty_sync(inode);
 }
 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
 
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index bf80503..22ee705 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -641,12 +641,14 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
+	struct inode *inode = data->header->inode;
+
 	if (nfs_async_handle_expired_key(task))
 		return -EAGAIN;
 
-	nfs_invalidate_atime(data->inode);
+	nfs_invalidate_atime(inode);
 	if (task->tk_status >= 0) {
-		nfs_refresh_inode(data->inode, data->res.fattr);
+		nfs_refresh_inode(inode, data->res.fattr);
 		/* Emulate the eof flag, which isn't normally needed in NFSv2
 		 * as it is guaranteed to always return the file attributes
 		 */
@@ -668,11 +670,13 @@ static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_dat
 
 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
+	struct inode *inode = data->header->inode;
+
 	if (nfs_async_handle_expired_key(task))
 		return -EAGAIN;
 
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
 	return 0;
 }
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4ddba67..d6d4682 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -35,19 +35,24 @@ static const struct rpc_call_ops nfs_read_full_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
 
-struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
+struct nfs_read_header *nfs_readhdr_alloc(unsigned int pagecount)
 {
-	struct nfs_read_data *p;
+	struct nfs_read_header *p;
 
 	p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
 	if (p) {
-		INIT_LIST_HEAD(&p->pages);
-		p->npages = pagecount;
-		if (pagecount <= ARRAY_SIZE(p->page_array))
-			p->pagevec = p->page_array;
+		struct nfs_pgio_header *hdr = &p->header;
+		struct nfs_read_data *data = &p->rpc_data;
+
+		INIT_LIST_HEAD(&hdr->pages);
+		INIT_LIST_HEAD(&data->list);
+		data->npages = pagecount;
+		data->header = hdr;
+		if (pagecount <= ARRAY_SIZE(data->page_array))
+			data->pagevec = data->page_array;
 		else {
-			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
-			if (!p->pagevec) {
+			data->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+			if (!data->pagevec) {
 				kmem_cache_free(nfs_rdata_cachep, p);
 				p = NULL;
 			}
@@ -56,17 +61,19 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 	return p;
 }
 
-void nfs_readdata_free(struct nfs_read_data *p)
+void nfs_readhdr_free(struct nfs_pgio_header *hdr)
 {
-	if (p && (p->pagevec != &p->page_array[0]))
-		kfree(p->pagevec);
-	kmem_cache_free(nfs_rdata_cachep, p);
+	struct nfs_read_header *rhdr = container_of(hdr, struct nfs_read_header, header);
+
+	kmem_cache_free(nfs_rdata_cachep, rhdr);
 }
 
 void nfs_readdata_release(struct nfs_read_data *rdata)
 {
 	put_nfs_open_context(rdata->args.context);
-	nfs_readdata_free(rdata);
+	if (rdata->pagevec != rdata->page_array)
+		kfree(rdata->pagevec);
+	nfs_readhdr_free(rdata->header);
 }
 
 static
@@ -173,13 +180,13 @@ int nfs_initiate_read(struct rpc_clnt *clnt,
 		      struct nfs_read_data *data,
 		      const struct rpc_call_ops *call_ops)
 {
-	struct inode *inode = data->inode;
+	struct inode *inode = data->header->inode;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
-		.rpc_cred = data->cred,
+		.rpc_cred = data->header->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
 		.task = &data->task,
@@ -216,11 +223,11 @@ EXPORT_SYMBOL_GPL(nfs_initiate_read);
 static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 		unsigned int count, unsigned int offset)
 {
-	struct inode *inode = req->wb_context->dentry->d_inode;
+	struct inode *inode = data->header->inode;
 
-	data->req	  = req;
-	data->inode	  = inode;
-	data->cred	  = req->wb_context->cred;
+	data->header->req	  = req;
+	data->header->inode	  = inode;
+	data->header->cred	  = req->wb_context->cred;
 
 	data->args.fh     = NFS_FH(inode);
 	data->args.offset = req_offset(req) + offset;
@@ -239,7 +246,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 static int nfs_do_read(struct nfs_read_data *data,
 		const struct rpc_call_ops *call_ops)
 {
-	struct inode *inode = data->args.context->dentry->d_inode;
+	struct inode *inode = data->header->inode;
 
 	return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops);
 }
@@ -293,6 +300,7 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head
 {
 	struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
 	struct page *page = req->wb_page;
+	struct nfs_read_header *rhdr;
 	struct nfs_read_data *data;
 	size_t rsize = desc->pg_bsize, nbytes;
 	unsigned int offset;
@@ -306,9 +314,10 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head
 	do {
 		size_t len = min(nbytes,rsize);
 
-		data = nfs_readdata_alloc(1);
-		if (!data)
+		rhdr = nfs_readhdr_alloc(1);
+		if (!rhdr)
 			goto out_bad;
+		data = &rhdr->rpc_data;
 		data->pagevec[0] = page;
 		nfs_read_rpcsetup(req, data, len, offset);
 		list_add(&data->list, res);
@@ -333,26 +342,28 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *
 {
 	struct nfs_page		*req;
 	struct page		**pages;
+	struct nfs_read_header	*rhdr;
 	struct nfs_read_data	*data;
 	struct list_head *head = &desc->pg_list;
 	int ret = 0;
 
-	data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
-						     desc->pg_count));
-	if (!data) {
+	rhdr = nfs_readhdr_alloc(nfs_page_array_len(desc->pg_base,
+						    desc->pg_count));
+	if (!rhdr) {
 		nfs_async_read_error(head);
 		ret = -ENOMEM;
 		goto out;
 	}
 
+	data = &rhdr->rpc_data;
 	pages = data->pagevec;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
-		nfs_list_add_request(req, &data->pages);
+		nfs_list_add_request(req, &rhdr->header.pages);
 		*pages++ = req->wb_page;
 	}
-	req = nfs_list_entry(data->pages.next);
+	req = nfs_list_entry(rhdr->header.pages.next);
 
 	nfs_read_rpcsetup(req, data, desc->pg_count, 0);
 	list_add(&data->list, res);
@@ -390,20 +401,21 @@ static const struct nfs_pageio_ops nfs_pageio_read_ops = {
  */
 int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
 {
+	struct inode *inode = data->header->inode;
 	int status;
 
 	dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid,
 			task->tk_status);
 
-	status = NFS_PROTO(data->inode)->read_done(task, data);
+	status = NFS_PROTO(inode)->read_done(task, data);
 	if (status != 0)
 		return status;
 
-	nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count);
+	nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count);
 
 	if (task->tk_status == -ESTALE) {
-		set_bit(NFS_INO_STALE, &NFS_I(data->inode)->flags);
-		nfs_mark_for_revalidate(data->inode);
+		set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+		nfs_mark_for_revalidate(inode);
 	}
 	return 0;
 }
@@ -417,7 +429,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
 		return;
 
 	/* This is a short read! */
-	nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
+	nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD);
 	/* Has the server at least made some progress? */
 	if (resp->count == 0)
 		return;
@@ -449,7 +461,7 @@ static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata)
 static void nfs_readpage_release_partial(void *calldata)
 {
 	struct nfs_read_data *data = calldata;
-	struct nfs_page *req = data->req;
+	struct nfs_page *req = data->header->req;
 	struct page *page = req->wb_page;
 	int status = data->task.tk_status;
 
@@ -461,13 +473,13 @@ static void nfs_readpage_release_partial(void *calldata)
 			SetPageUptodate(page);
 		nfs_readpage_release(req);
 	}
-	nfs_readdata_release(calldata);
+	nfs_readdata_release(data);
 }
 
 void nfs_read_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
-	NFS_PROTO(data->inode)->read_rpc_prepare(task, data);
+	NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
 }
 
 static const struct rpc_call_ops nfs_read_partial_ops = {
@@ -524,9 +536,10 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
 static void nfs_readpage_release_full(void *calldata)
 {
 	struct nfs_read_data *data = calldata;
+	struct nfs_pgio_header *hdr = data->header;
 
-	while (!list_empty(&data->pages)) {
-		struct nfs_page *req = nfs_list_entry(data->pages.next);
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 
 		nfs_list_remove_request(req);
 		nfs_readpage_release(req);
@@ -685,7 +698,7 @@ out:
 int __init nfs_init_readpagecache(void)
 {
 	nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
-					     sizeof(struct nfs_read_data),
+					     sizeof(struct nfs_read_header),
 					     0, SLAB_HWCACHE_ALIGN,
 					     NULL);
 	if (nfs_rdata_cachep == NULL)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 76735dd..dbb5c0a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -69,19 +69,24 @@ void nfs_commit_free(struct nfs_commit_data *p)
 }
 EXPORT_SYMBOL_GPL(nfs_commit_free);
 
-struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
+struct nfs_write_header *nfs_writehdr_alloc(unsigned int pagecount)
 {
-	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
+	struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
 
 	if (p) {
+		struct nfs_pgio_header *hdr = &p->header;
+		struct nfs_write_data *data = &p->rpc_data;
+
 		memset(p, 0, sizeof(*p));
-		INIT_LIST_HEAD(&p->pages);
-		p->npages = pagecount;
-		if (pagecount <= ARRAY_SIZE(p->page_array))
-			p->pagevec = p->page_array;
+		INIT_LIST_HEAD(&hdr->pages);
+		INIT_LIST_HEAD(&data->list);
+		data->npages = pagecount;
+		data->header = hdr;
+		if (pagecount <= ARRAY_SIZE(data->page_array))
+			data->pagevec = data->page_array;
 		else {
-			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
-			if (!p->pagevec) {
+			data->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			if (!data->pagevec) {
 				mempool_free(p, nfs_wdata_mempool);
 				p = NULL;
 			}
@@ -90,17 +95,18 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 	return p;
 }
 
-void nfs_writedata_free(struct nfs_write_data *p)
+void nfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
-	if (p && (p->pagevec != &p->page_array[0]))
-		kfree(p->pagevec);
-	mempool_free(p, nfs_wdata_mempool);
+	struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);
+	mempool_free(whdr, nfs_wdata_mempool);
 }
 
 void nfs_writedata_release(struct nfs_write_data *wdata)
 {
 	put_nfs_open_context(wdata->args.context);
-	nfs_writedata_free(wdata);
+	if (wdata->pagevec != wdata->page_array)
+		kfree(wdata->pagevec);
+	nfs_writehdr_free(wdata->header);
 }
 
 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -507,9 +513,8 @@ static inline
 int nfs_write_need_commit(struct nfs_write_data *data)
 {
 	if (data->verf.committed == NFS_DATA_SYNC)
-		return data->lseg == NULL;
-	else
-		return data->verf.committed != NFS_FILE_SYNC;
+		return data->header->lseg == NULL;
+	return data->verf.committed != NFS_FILE_SYNC;
 }
 
 static inline
@@ -517,7 +522,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req,
 				  struct nfs_write_data *data)
 {
 	if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
-		nfs_mark_request_commit(req, data->lseg);
+		nfs_mark_request_commit(req, data->header->lseg);
 		return 1;
 	}
 	if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) {
@@ -841,13 +846,13 @@ int nfs_initiate_write(struct rpc_clnt *clnt,
 		       const struct rpc_call_ops *call_ops,
 		       int how)
 {
-	struct inode *inode = data->inode;
+	struct inode *inode = data->header->inode;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
-		.rpc_cred = data->cred,
+		.rpc_cred = data->header->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = clnt,
@@ -896,14 +901,15 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 		unsigned int count, unsigned int offset,
 		int how)
 {
+	struct nfs_pgio_header *hdr = data->header;
 	struct inode *inode = req->wb_context->dentry->d_inode;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
 
-	data->req = req;
-	data->inode = inode = req->wb_context->dentry->d_inode;
-	data->cred = req->wb_context->cred;
+	hdr->req = req;
+	hdr->inode = inode = req->wb_context->dentry->d_inode;
+	hdr->cred = req->wb_context->cred;
 
 	data->args.fh     = NFS_FH(inode);
 	data->args.offset = req_offset(req) + offset;
@@ -935,7 +941,7 @@ static int nfs_do_write(struct nfs_write_data *data,
 		const struct rpc_call_ops *call_ops,
 		int how)
 {
-	struct inode *inode = data->args.context->dentry->d_inode;
+	struct inode *inode = data->header->inode;
 
 	return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how);
 }
@@ -981,6 +987,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head
 {
 	struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
 	struct page *page = req->wb_page;
+	struct nfs_write_header *whdr;
 	struct nfs_write_data *data;
 	size_t wsize = desc->pg_bsize, nbytes;
 	unsigned int offset;
@@ -1000,9 +1007,10 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head
 	do {
 		size_t len = min(nbytes, wsize);
 
-		data = nfs_writedata_alloc(1);
-		if (!data)
+		whdr = nfs_writehdr_alloc(1);
+		if (!whdr)
 			goto out_bad;
+		data = &whdr->rpc_data;
 		data->pagevec[0] = page;
 		nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags);
 		list_add(&data->list, res);
@@ -1036,13 +1044,14 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *r
 {
 	struct nfs_page		*req;
 	struct page		**pages;
+	struct nfs_write_header	*whdr;
 	struct nfs_write_data	*data;
 	struct list_head *head = &desc->pg_list;
 	int ret = 0;
 
-	data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
-						      desc->pg_count));
-	if (!data) {
+	whdr = nfs_writehdr_alloc(nfs_page_array_len(desc->pg_base,
+						     desc->pg_count));
+	if (!whdr) {
 		while (!list_empty(head)) {
 			req = nfs_list_entry(head->next);
 			nfs_list_remove_request(req);
@@ -1051,14 +1060,15 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *r
 		ret = -ENOMEM;
 		goto out;
 	}
+	data = &whdr->rpc_data;
 	pages = data->pagevec;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
-		nfs_list_add_request(req, &data->pages);
+		nfs_list_add_request(req, &whdr->header.pages);
 		*pages++ = req->wb_page;
 	}
-	req = nfs_list_entry(data->pages.next);
+	req = nfs_list_entry(whdr->header.pages.next);
 
 	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
 	    (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
@@ -1126,10 +1136,11 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 
 	dprintk("NFS: %5u write(%s/%lld %d@%lld)",
 		task->tk_pid,
-		data->req->wb_context->dentry->d_inode->i_sb->s_id,
+		data->header->inode->i_sb->s_id,
 		(long long)
-		  NFS_FILEID(data->req->wb_context->dentry->d_inode),
-		data->req->wb_bytes, (long long)req_offset(data->req));
+		  NFS_FILEID(data->header->inode),
+		data->header->req->wb_bytes,
+		(long long)req_offset(data->header->req));
 
 	nfs_writeback_done(task, data);
 }
@@ -1137,7 +1148,7 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 static void nfs_writeback_release_partial(void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
-	struct nfs_page		*req = data->req;
+	struct nfs_page		*req = data->header->req;
 	struct page		*page = req->wb_page;
 	int status = data->task.tk_status;
 
@@ -1169,13 +1180,13 @@ static void nfs_writeback_release_partial(void *calldata)
 out:
 	if (atomic_dec_and_test(&req->wb_complete))
 		nfs_writepage_release(req, data);
-	nfs_writedata_release(calldata);
+	nfs_writedata_release(data);
 }
 
 void nfs_write_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
-	NFS_PROTO(data->inode)->write_rpc_prepare(task, data);
+	NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
 }
 
 void nfs_commit_prepare(struct rpc_task *task, void *calldata)
@@ -1208,11 +1219,12 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
 static void nfs_writeback_release_full(void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
+	struct nfs_pgio_header *hdr = data->header;
 	int status = data->task.tk_status;
 
 	/* Update attributes as result of writeback. */
-	while (!list_empty(&data->pages)) {
-		struct nfs_page *req = nfs_list_entry(data->pages.next);
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 		struct page *page = req->wb_page;
 
 		nfs_list_remove_request(req);
@@ -1233,7 +1245,7 @@ static void nfs_writeback_release_full(void *calldata)
 
 		if (nfs_write_need_commit(data)) {
 			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
-			nfs_mark_request_commit(req, data->lseg);
+			nfs_mark_request_commit(req, hdr->lseg);
 			dprintk(" marked for commit\n");
 			goto next;
 		}
@@ -1244,7 +1256,7 @@ remove_request:
 		nfs_unlock_request(req);
 		nfs_end_page_writeback(page);
 	}
-	nfs_writedata_release(calldata);
+	nfs_writedata_release(data);
 }
 
 static const struct rpc_call_ops nfs_write_full_ops = {
@@ -1261,6 +1273,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct nfs_writeargs	*argp = &data->args;
 	struct nfs_writeres	*resp = &data->res;
+	struct inode		*inode = data->header->inode;
 	int status;
 
 	dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
@@ -1273,10 +1286,10 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 	 * another writer had changed the file, but some applications
 	 * depend on tighter cache coherency when writing.
 	 */
-	status = NFS_PROTO(data->inode)->write_done(task, data);
+	status = NFS_PROTO(inode)->write_done(task, data);
 	if (status != 0)
 		return;
-	nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
+	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 	if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
@@ -1294,7 +1307,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 		if (time_before(complain, jiffies)) {
 			dprintk("NFS:       faulty NFS server %s:"
 				" (committed = %d) != (stable = %d)\n",
-				NFS_SERVER(data->inode)->nfs_client->cl_hostname,
+				NFS_SERVER(inode)->nfs_client->cl_hostname,
 				resp->verf->committed, argp->stable);
 			complain = jiffies + 300 * HZ;
 		}
@@ -1304,7 +1317,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 	if (task->tk_status >= 0 && resp->count < argp->count) {
 		static unsigned long    complain;
 
-		nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE);
+		nfs_inc_stats(inode, NFSIOS_SHORTWRITE);
 
 		/* Has the server at least made some progress? */
 		if (resp->count != 0) {
@@ -1333,7 +1346,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 		/* Can't do anything about it except throw an error. */
 		task->tk_status = -EIO;
 	}
-	return;
 }
 
 
@@ -1745,7 +1757,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 int __init nfs_init_writepagecache(void)
 {
 	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
-					     sizeof(struct nfs_write_data),
+					     sizeof(struct nfs_write_header),
 					     0, SLAB_HWCACHE_ALIGN,
 					     NULL);
 	if (nfs_wdata_cachep == NULL)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d5d68f3..8d3a2b8 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -569,12 +569,6 @@ nfs_have_writebacks(struct inode *inode)
 }
 
 /*
- * Allocate nfs_write_data structures
- */
-extern struct nfs_write_data *nfs_writedata_alloc(unsigned int npages);
-extern void nfs_writedata_free(struct nfs_write_data *);
-
-/*
  * linux/fs/nfs/read.c
  */
 extern int  nfs_readpage(struct file *, struct page *);
@@ -585,12 +579,6 @@ extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
 			       struct page *);
 
 /*
- * Allocate nfs_read_data structures
- */
-extern struct nfs_read_data *nfs_readdata_alloc(unsigned int npages);
-extern void nfs_readdata_free(struct nfs_read_data *);
-
-/*
  * linux/fs/nfs3proc.c
  */
 #ifdef CONFIG_NFS_V3_ACL
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 8fb036a..fee3241 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1168,52 +1168,58 @@ struct nfs_page;
 #define NFS_PAGEVEC_SIZE	(8U)
 
 struct nfs_read_data {
+	struct nfs_pgio_header	*header;
+	struct list_head	list;
 	struct rpc_task		task;
-	struct inode		*inode;
-	struct rpc_cred		*cred;
 	struct nfs_fattr	fattr;	/* fattr storage */
-	struct list_head	pages;	/* Coalesced read requests */
-	struct list_head	list;	/* lists of struct nfs_read_data */
-	struct nfs_page		*req;	/* multi ops per nfs_page */
 	struct page		**pagevec;
 	unsigned int		npages;	/* Max length of pagevec */
 	struct nfs_readargs args;
 	struct nfs_readres  res;
 	unsigned long		timestamp;	/* For lease renewal */
-	struct pnfs_layout_segment *lseg;
-	struct nfs_client	*ds_clp;	/* pNFS data server */
-	const struct rpc_call_ops *mds_ops;
 	int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
 	__u64			mds_offset;
-	int			pnfs_error;
 	struct page		*page_array[NFS_PAGEVEC_SIZE];
+	struct nfs_client	*ds_clp;	/* pNFS data server */
+};
+
+struct nfs_pgio_header {
+	struct inode		*inode;
+	struct rpc_cred		*cred;
+	struct list_head	pages;
+	struct nfs_page		*req;
+	struct pnfs_layout_segment *lseg;
+	const struct rpc_call_ops *mds_ops;
+	int			pnfs_error;
+};
+
+struct nfs_read_header {
+	struct nfs_pgio_header	header;
+	struct nfs_read_data	rpc_data;
 };
 
 struct nfs_direct_req;
 
 struct nfs_write_data {
+	struct nfs_pgio_header	*header;
+	struct list_head	list;
 	struct rpc_task		task;
-	struct inode		*inode;
-	struct rpc_cred		*cred;
 	struct nfs_fattr	fattr;
 	struct nfs_writeverf	verf;
-	struct list_head	pages;		/* Coalesced requests we wish to flush */
-	struct list_head	list;		/* lists of struct nfs_write_data */
-	struct nfs_page		*req;		/* multi ops per nfs_page */
 	struct page		**pagevec;
 	unsigned int		npages;		/* Max length of pagevec */
 	struct nfs_writeargs	args;		/* argument struct */
 	struct nfs_writeres	res;		/* result struct */
-	struct pnfs_layout_segment *lseg;
-	struct nfs_client	*ds_clp;	/* pNFS data server */
-	const struct rpc_call_ops *mds_ops;
-	int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
-#ifdef CONFIG_NFS_V4
 	unsigned long		timestamp;	/* For lease renewal */
-#endif
+	int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
 	__u64			mds_offset;	/* Filelayout dense stripe */
-	int			pnfs_error;
 	struct page		*page_array[NFS_PAGEVEC_SIZE];
+	struct nfs_client	*ds_clp;	/* pNFS data server */
+};
+
+struct nfs_write_header {
+	struct nfs_pgio_header	header;
+	struct nfs_write_data	rpc_data;
 };
 
 struct nfs_commit_data {
-- 
cgit v0.10.2


From 30dd374f6fc1b202db3a1b57b61afff1326bad92 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:45 -0400
Subject: NFS: create struct nfs_page_array

Both nfs_read_data and nfs_write_data devote several fields which
can be combined into a single shared struct.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 7a48251..7ae8a60 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -242,7 +242,7 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
 
 	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
-	       rdata->npages, f_offset, (unsigned int)rdata->args.count);
+	       rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);
 
 	par = alloc_parallel(rdata);
 	if (!par)
@@ -252,7 +252,7 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 
 	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
 	/* Code assumes extents are page-aligned */
-	for (i = pg_index; i < rdata->npages; i++) {
+	for (i = pg_index; i < rdata->pages.npages; i++) {
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
@@ -285,7 +285,8 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 			struct pnfs_block_extent *be_read;
 
 			be_read = (hole && cow_read) ? cow_read : be;
-			bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
+			bio = bl_add_page_to_bio(bio, rdata->pages.npages - i,
+						 READ,
 						 isect, pages[i], be_read,
 						 bl_end_io_read, par);
 			if (IS_ERR(bio)) {
@@ -654,7 +655,7 @@ next_page:
 
 	/* Middle pages */
 	pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
-	for (i = pg_index; i < wdata->npages; i++) {
+	for (i = pg_index; i < wdata->pages.npages; i++) {
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
@@ -688,7 +689,7 @@ next_page:
 				goto out;
 			}
 		}
-		bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
+		bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
 					 isect, pages[i], be,
 					 bl_end_io_write, par);
 		if (IS_ERR(bio)) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 56176af..0faba4c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -252,11 +252,11 @@ static void nfs_direct_read_release(void *calldata)
 	} else {
 		dreq->count += data->res.count;
 		spin_unlock(&dreq->lock);
-		nfs_direct_dirty_pages(data->pagevec,
+		nfs_direct_dirty_pages(data->pages.pagevec,
 				data->args.pgbase,
 				data->res.count);
 	}
-	nfs_direct_release_pages(data->pagevec, data->npages);
+	nfs_direct_release_pages(data->pages.pagevec, data->pages.npages);
 
 	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
@@ -273,8 +273,8 @@ static void nfs_direct_readhdr_release(struct nfs_read_header *rhdr)
 {
 	struct nfs_read_data *data = &rhdr->rpc_data;
 
-	if (data->pagevec != data->page_array)
-		kfree(data->pagevec);
+	if (data->pages.pagevec != data->pages.page_array)
+		kfree(data->pages.pagevec);
 	nfs_readhdr_free(&rhdr->header);
 }
 
@@ -312,6 +312,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 	do {
 		struct nfs_read_header *rhdr;
 		struct nfs_read_data *data;
+		struct nfs_page_array *pages;
 		size_t bytes;
 
 		pgbase = user_addr & ~PAGE_MASK;
@@ -322,24 +323,25 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		if (unlikely(!rhdr))
 			break;
 		data = &rhdr->rpc_data;
+		pages = &data->pages;
 
 		down_read(&current->mm->mmap_sem);
 		result = get_user_pages(current, current->mm, user_addr,
-					data->npages, 1, 0, data->pagevec, NULL);
+					pages->npages, 1, 0, pages->pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
 		if (result < 0) {
 			nfs_direct_readhdr_release(rhdr);
 			break;
 		}
-		if ((unsigned)result < data->npages) {
+		if ((unsigned)result < pages->npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
-				nfs_direct_release_pages(data->pagevec, result);
+				nfs_direct_release_pages(pages->pagevec, result);
 				nfs_direct_readhdr_release(rhdr);
 				break;
 			}
 			bytes -= pgbase;
-			data->npages = result;
+			pages->npages = result;
 		}
 
 		get_dreq(dreq);
@@ -352,7 +354,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		data->args.lock_context = dreq->l_ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = data->pagevec;
+		data->args.pages = pages->pagevec;
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.eof = 0;
@@ -462,8 +464,8 @@ static void nfs_direct_writehdr_release(struct nfs_write_header *whdr)
 {
 	struct nfs_write_data *data = &whdr->rpc_data;
 
-	if (data->pagevec != data->page_array)
-		kfree(data->pagevec);
+	if (data->pages.pagevec != data->pages.page_array)
+		kfree(data->pages.pagevec);
 	nfs_writehdr_free(&whdr->header);
 }
 
@@ -472,8 +474,10 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 	while (!list_empty(&dreq->rewrite_list)) {
 		struct nfs_pgio_header *hdr = list_entry(dreq->rewrite_list.next, struct nfs_pgio_header, pages);
 		struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);
+		struct nfs_page_array *p = &whdr->rpc_data.pages;
+
 		list_del(&hdr->pages);
-		nfs_direct_release_pages(whdr->rpc_data.pagevec, whdr->rpc_data.npages);
+		nfs_direct_release_pages(p->pagevec, p->npages);
 		nfs_direct_writehdr_release(whdr);
 	}
 }
@@ -751,6 +755,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 	do {
 		struct nfs_write_header *whdr;
 		struct nfs_write_data *data;
+		struct nfs_page_array *pages;
 		size_t bytes;
 
 		pgbase = user_addr & ~PAGE_MASK;
@@ -762,24 +767,25 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 			break;
 
 		data = &whdr->rpc_data;
+		pages = &data->pages;
 
 		down_read(&current->mm->mmap_sem);
 		result = get_user_pages(current, current->mm, user_addr,
-					data->npages, 0, 0, data->pagevec, NULL);
+					pages->npages, 0, 0, pages->pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
 		if (result < 0) {
 			nfs_direct_writehdr_release(whdr);
 			break;
 		}
-		if ((unsigned)result < data->npages) {
+		if ((unsigned)result < pages->npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
-				nfs_direct_release_pages(data->pagevec, result);
+				nfs_direct_release_pages(pages->pagevec, result);
 				nfs_direct_writehdr_release(whdr);
 				break;
 			}
 			bytes -= pgbase;
-			data->npages = result;
+			pages->npages = result;
 		}
 
 		get_dreq(dreq);
@@ -794,7 +800,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		data->args.lock_context = dreq->l_ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = data->pagevec;
+		data->args.pages = pages->pagevec;
 		data->args.count = bytes;
 		data->args.stable = sync;
 		data->res.fattr = &data->fattr;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7dc9be1..5c3d77f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -210,6 +210,7 @@ extern void nfs_destroy_writepagecache(void);
 
 extern int __init nfs_init_directcache(void);
 extern void nfs_destroy_directcache(void);
+extern bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount);
 
 /* nfs2xdr.c */
 extern int nfs_stat_to_errno(enum nfs_stat);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index d21fcea..d349bd4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,6 +26,19 @@
 
 static struct kmem_cache *nfs_page_cachep;
 
+bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
+{
+	p->npages = pagecount;
+	if (pagecount <= ARRAY_SIZE(p->page_array))
+		p->pagevec = p->page_array;
+	else {
+		p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+		if (!p->pagevec)
+			p->npages = 0;
+	}
+	return p->pagevec != NULL;
+}
+
 static inline struct nfs_page *
 nfs_page_alloc(void)
 {
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index d6d4682..f6ab30b 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -46,16 +46,10 @@ struct nfs_read_header *nfs_readhdr_alloc(unsigned int pagecount)
 
 		INIT_LIST_HEAD(&hdr->pages);
 		INIT_LIST_HEAD(&data->list);
-		data->npages = pagecount;
 		data->header = hdr;
-		if (pagecount <= ARRAY_SIZE(data->page_array))
-			data->pagevec = data->page_array;
-		else {
-			data->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
-			if (!data->pagevec) {
-				kmem_cache_free(nfs_rdata_cachep, p);
-				p = NULL;
-			}
+		if (!nfs_pgarray_set(&data->pages, pagecount)) {
+			kmem_cache_free(nfs_rdata_cachep, p);
+			p = NULL;
 		}
 	}
 	return p;
@@ -71,8 +65,8 @@ void nfs_readhdr_free(struct nfs_pgio_header *hdr)
 void nfs_readdata_release(struct nfs_read_data *rdata)
 {
 	put_nfs_open_context(rdata->args.context);
-	if (rdata->pagevec != rdata->page_array)
-		kfree(rdata->pagevec);
+	if (rdata->pages.pagevec != rdata->pages.page_array)
+		kfree(rdata->pages.pagevec);
 	nfs_readhdr_free(rdata->header);
 }
 
@@ -232,7 +226,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	data->args.fh     = NFS_FH(inode);
 	data->args.offset = req_offset(req) + offset;
 	data->args.pgbase = req->wb_pgbase + offset;
-	data->args.pages  = data->pagevec;
+	data->args.pages  = data->pages.pagevec;
 	data->args.count  = count;
 	data->args.context = get_nfs_open_context(req->wb_context);
 	data->args.lock_context = req->wb_lock_context;
@@ -318,7 +312,7 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head
 		if (!rhdr)
 			goto out_bad;
 		data = &rhdr->rpc_data;
-		data->pagevec[0] = page;
+		data->pages.pagevec[0] = page;
 		nfs_read_rpcsetup(req, data, len, offset);
 		list_add(&data->list, res);
 		requests++;
@@ -356,7 +350,7 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *
 	}
 
 	data = &rhdr->rpc_data;
-	pages = data->pagevec;
+	pages = data->pages.pagevec;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index dbb5c0a..2efae04 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -80,16 +80,10 @@ struct nfs_write_header *nfs_writehdr_alloc(unsigned int pagecount)
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&hdr->pages);
 		INIT_LIST_HEAD(&data->list);
-		data->npages = pagecount;
 		data->header = hdr;
-		if (pagecount <= ARRAY_SIZE(data->page_array))
-			data->pagevec = data->page_array;
-		else {
-			data->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
-			if (!data->pagevec) {
-				mempool_free(p, nfs_wdata_mempool);
-				p = NULL;
-			}
+		if (!nfs_pgarray_set(&data->pages, pagecount)) {
+			mempool_free(p, nfs_wdata_mempool);
+			p = NULL;
 		}
 	}
 	return p;
@@ -104,8 +98,8 @@ void nfs_writehdr_free(struct nfs_pgio_header *hdr)
 void nfs_writedata_release(struct nfs_write_data *wdata)
 {
 	put_nfs_open_context(wdata->args.context);
-	if (wdata->pagevec != wdata->page_array)
-		kfree(wdata->pagevec);
+	if (wdata->pages.pagevec != wdata->pages.page_array)
+		kfree(wdata->pages.pagevec);
 	nfs_writehdr_free(wdata->header);
 }
 
@@ -916,7 +910,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	/* pnfs_set_layoutcommit needs this */
 	data->mds_offset = data->args.offset;
 	data->args.pgbase = req->wb_pgbase + offset;
-	data->args.pages  = data->pagevec;
+	data->args.pages  = data->pages.pagevec;
 	data->args.count  = count;
 	data->args.context = get_nfs_open_context(req->wb_context);
 	data->args.lock_context = req->wb_lock_context;
@@ -1011,7 +1005,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head
 		if (!whdr)
 			goto out_bad;
 		data = &whdr->rpc_data;
-		data->pagevec[0] = page;
+		data->pages.pagevec[0] = page;
 		nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags);
 		list_add(&data->list, res);
 		requests++;
@@ -1061,7 +1055,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *r
 		goto out;
 	}
 	data = &whdr->rpc_data;
-	pages = data->pagevec;
+	pages = data->pages.pagevec;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index fee3241..e34beaf 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1167,19 +1167,23 @@ struct nfs_page;
 
 #define NFS_PAGEVEC_SIZE	(8U)
 
+struct nfs_page_array {
+	struct page		**pagevec;
+	unsigned int		npages;		/* Max length of pagevec */
+	struct page		*page_array[NFS_PAGEVEC_SIZE];
+};
+
 struct nfs_read_data {
 	struct nfs_pgio_header	*header;
 	struct list_head	list;
 	struct rpc_task		task;
 	struct nfs_fattr	fattr;	/* fattr storage */
-	struct page		**pagevec;
-	unsigned int		npages;	/* Max length of pagevec */
 	struct nfs_readargs args;
 	struct nfs_readres  res;
 	unsigned long		timestamp;	/* For lease renewal */
 	int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
 	__u64			mds_offset;
-	struct page		*page_array[NFS_PAGEVEC_SIZE];
+	struct nfs_page_array	pages;
 	struct nfs_client	*ds_clp;	/* pNFS data server */
 };
 
@@ -1206,14 +1210,12 @@ struct nfs_write_data {
 	struct rpc_task		task;
 	struct nfs_fattr	fattr;
 	struct nfs_writeverf	verf;
-	struct page		**pagevec;
-	unsigned int		npages;		/* Max length of pagevec */
 	struct nfs_writeargs	args;		/* argument struct */
 	struct nfs_writeres	res;		/* result struct */
 	unsigned long		timestamp;	/* For lease renewal */
 	int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
 	__u64			mds_offset;	/* Filelayout dense stripe */
-	struct page		*page_array[NFS_PAGEVEC_SIZE];
+	struct nfs_page_array	pages;
 	struct nfs_client	*ds_clp;	/* pNFS data server */
 };
 
-- 
cgit v0.10.2


From 4db6e0b74c0f6dfc2f9c0690e8df512e3b635983 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:46 -0400
Subject: NFS: merge _full and _partial read rpc_ops

Decouple nfs_pgio_header and nfs_read_data, and have (possibly
multiple) nfs_read_datas each take a refcount on nfs_pgio_header.

For the moment keeps nfs_read_header as a way to preallocate a single
nfs_read_data with the nfs_pgio_header.  The code doesn't need this,
and would be prettier without, but given the amount of churn I am
already introducing I didn't want to play with tuning new mempools.

This also fixes bug in pnfs_ld_handle_read_error.  In the case of
desc->pg_bsize < PAGE_CACHE_SIZE, the pages list was empty, causing
replay attempt to do nothing.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0faba4c..90b00ce 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -319,10 +319,16 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		bytes = min(rsize,count);
 
 		result = -ENOMEM;
-		rhdr = nfs_readhdr_alloc(nfs_page_array_len(pgbase, bytes));
+		rhdr = nfs_readhdr_alloc();
 		if (unlikely(!rhdr))
 			break;
-		data = &rhdr->rpc_data;
+		data = nfs_readdata_alloc(&rhdr->header, nfs_page_array_len(pgbase, bytes));
+		if (!data) {
+			nfs_readhdr_free(&rhdr->header);
+			break;
+		}
+		data->header = &rhdr->header;
+		atomic_inc(&data->header->refcnt);
 		pages = &data->pages;
 
 		down_read(&current->mm->mmap_sem);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5c3d77f..33af5e5 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -200,6 +200,7 @@ struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry)
 extern struct svc_version nfs4_callback_version1;
 extern struct svc_version nfs4_callback_version4;
 
+struct nfs_pageio_descriptor;
 /* pagelist.c */
 extern int __init nfs_init_nfspagecache(void);
 extern void nfs_destroy_nfspagecache(void);
@@ -211,6 +212,10 @@ extern void nfs_destroy_writepagecache(void);
 extern int __init nfs_init_directcache(void);
 extern void nfs_destroy_directcache(void);
 extern bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount);
+extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
+			      struct nfs_pgio_header *hdr,
+			      void (*release)(struct nfs_pgio_header *hdr));
+void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
 
 /* nfs2xdr.c */
 extern int nfs_stat_to_errno(enum nfs_stat);
@@ -295,17 +300,19 @@ extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
 extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
 #endif
 
-struct nfs_pageio_descriptor;
 /* read.c */
-extern struct nfs_read_header *nfs_readhdr_alloc(unsigned int npages);
+extern void nfs_async_read_error(struct list_head *head);
+extern struct nfs_read_header *nfs_readhdr_alloc(void);
 extern void nfs_readhdr_free(struct nfs_pgio_header *hdr);
+extern void nfs_read_completion(struct nfs_pgio_header *hdr);
+extern struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr,
+						unsigned int pagecount);
 extern int nfs_initiate_read(struct rpc_clnt *clnt,
 			     struct nfs_read_data *data,
 			     const struct rpc_call_ops *call_ops);
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
-		struct list_head *head);
-
+			      struct nfs_pgio_header *hdr);
 extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
 		struct inode *inode);
 extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index ad1d680..333e765 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -227,7 +227,6 @@ static void filelayout_read_release(void *data)
 {
 	struct nfs_read_data *rdata = data;
 
-	put_lseg(rdata->header->lseg);
 	rdata->header->mds_ops->rpc_release(data);
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5375862..ce31ab2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3391,8 +3391,6 @@ void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
 	struct inode *inode = hdr->inode;
 
 	dprintk("%s Reset task for i/o through\n", __func__);
-	put_lseg(hdr->lseg);
-	hdr->lseg = NULL;
 	data->ds_clp = NULL;
 	/* offsets will differ in the dense stripe case */
 	data->args.offset = data->mds_offset;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index d349bd4..cd4c038 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -39,6 +39,30 @@ bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
 	return p->pagevec != NULL;
 }
 
+void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
+		       struct nfs_pgio_header *hdr,
+		       void (*release)(struct nfs_pgio_header *hdr))
+{
+	hdr->req = nfs_list_entry(desc->pg_list.next);
+	hdr->inode = desc->pg_inode;
+	hdr->cred = hdr->req->wb_context->cred;
+	hdr->io_start = req_offset(hdr->req);
+	hdr->good_bytes = desc->pg_count;
+	hdr->release = release;
+}
+
+void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
+{
+	spin_lock(&hdr->lock);
+	if (pos < hdr->io_start + hdr->good_bytes) {
+		set_bit(NFS_IOHDR_ERROR, &hdr->flags);
+		clear_bit(NFS_IOHDR_EOF, &hdr->flags);
+		hdr->good_bytes = pos - hdr->io_start;
+		hdr->error = error;
+	}
+	spin_unlock(&hdr->lock);
+}
+
 static inline struct nfs_page *
 nfs_page_alloc(void)
 {
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d705da4..d1a91db 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1333,7 +1333,9 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
 		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
 		pnfs_return_layout(hdr->inode);
 	}
-	data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, &hdr->pages);
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+		data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
+								&hdr->pages);
 }
 
 /*
@@ -1348,7 +1350,6 @@ void pnfs_ld_read_done(struct nfs_read_data *data)
 		hdr->mds_ops->rpc_call_done(&data->task, data);
 	} else
 		pnfs_ld_handle_read_error(data);
-	put_lseg(hdr->lseg);
 	hdr->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
@@ -1359,11 +1360,11 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
 {
 	struct nfs_pgio_header *hdr = data->header;
 
-	list_splice_tail_init(&hdr->pages, &desc->pg_list);
-	if (hdr->req && list_empty(&hdr->req->wb_list))
-		nfs_list_add_request(hdr->req, &desc->pg_list);
-	nfs_pageio_reset_read_mds(desc);
-	desc->pg_recoalesce = 1;
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		list_splice_tail_init(&hdr->pages, &desc->pg_list);
+		nfs_pageio_reset_read_mds(desc);
+		desc->pg_recoalesce = 1;
+	}
 	nfs_readdata_release(data);
 }
 
@@ -1381,18 +1382,13 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
 	enum pnfs_try_status trypnfs;
 
 	hdr->mds_ops = call_ops;
-	hdr->lseg = get_lseg(lseg);
 
 	dprintk("%s: Reading ino:%lu %u@%llu\n",
 		__func__, inode->i_ino, rdata->args.count, rdata->args.offset);
 
 	trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
-	if (trypnfs == PNFS_NOT_ATTEMPTED) {
-		put_lseg(hdr->lseg);
-		hdr->lseg = NULL;
-	} else {
+	if (trypnfs != PNFS_NOT_ATTEMPTED)
 		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
-	}
 	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
 	return trypnfs;
 }
@@ -1408,7 +1404,7 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea
 	while (!list_empty(head)) {
 		enum pnfs_try_status trypnfs;
 
-		data = list_entry(head->next, struct nfs_read_data, list);
+		data = list_first_entry(head, struct nfs_read_data, list);
 		list_del_init(&data->list);
 
 		trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
@@ -1418,20 +1414,41 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea
 	put_lseg(lseg);
 }
 
+static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
+{
+	put_lseg(hdr->lseg);
+	nfs_readhdr_free(hdr);
+}
+
 int
 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 {
-	LIST_HEAD(head);
+	struct nfs_read_header *rhdr;
+	struct nfs_pgio_header *hdr;
 	int ret;
 
-	ret = nfs_generic_pagein(desc, &head);
-	if (ret != 0) {
+	rhdr = nfs_readhdr_alloc();
+	if (!rhdr) {
+		nfs_async_read_error(&desc->pg_list);
+		ret = -ENOMEM;
 		put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 		return ret;
 	}
-	pnfs_do_multiple_reads(desc, &head);
-	return 0;
+	hdr = &rhdr->header;
+	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
+	hdr->lseg = get_lseg(desc->pg_lseg);
+	atomic_inc(&hdr->refcnt);
+	ret = nfs_generic_pagein(desc, hdr);
+	if (ret != 0) {
+		put_lseg(desc->pg_lseg);
+		desc->pg_lseg = NULL;
+		set_bit(NFS_IOHDR_REDO, &hdr->flags);
+	} else
+		pnfs_do_multiple_reads(desc, &hdr->rpc_list);
+	if (atomic_dec_and_test(&hdr->refcnt))
+		nfs_read_completion(hdr);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f6ab30b..c9633b2 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -30,29 +30,49 @@
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
 static const struct nfs_pageio_ops nfs_pageio_read_ops;
-static const struct rpc_call_ops nfs_read_partial_ops;
-static const struct rpc_call_ops nfs_read_full_ops;
+static const struct rpc_call_ops nfs_read_common_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
 
-struct nfs_read_header *nfs_readhdr_alloc(unsigned int pagecount)
+struct nfs_read_header *nfs_readhdr_alloc()
 {
-	struct nfs_read_header *p;
+	struct nfs_read_header *rhdr;
 
-	p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
-	if (p) {
-		struct nfs_pgio_header *hdr = &p->header;
-		struct nfs_read_data *data = &p->rpc_data;
+	rhdr = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
+	if (rhdr) {
+		struct nfs_pgio_header *hdr = &rhdr->header;
 
 		INIT_LIST_HEAD(&hdr->pages);
-		INIT_LIST_HEAD(&data->list);
+		INIT_LIST_HEAD(&hdr->rpc_list);
+		spin_lock_init(&hdr->lock);
+		atomic_set(&hdr->refcnt, 0);
+	}
+	return rhdr;
+}
+
+struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr,
+					 unsigned int pagecount)
+{
+	struct nfs_read_data *data, *prealloc;
+
+	prealloc = &container_of(hdr, struct nfs_read_header, header)->rpc_data;
+	if (prealloc->header == NULL)
+		data = prealloc;
+	else
+		data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto out;
+
+	if (nfs_pgarray_set(&data->pages, pagecount)) {
 		data->header = hdr;
-		if (!nfs_pgarray_set(&data->pages, pagecount)) {
-			kmem_cache_free(nfs_rdata_cachep, p);
-			p = NULL;
-		}
+		atomic_inc(&hdr->refcnt);
+	} else {
+		if (data != prealloc)
+			kfree(data);
+		data = NULL;
 	}
-	return p;
+out:
+	return data;
 }
 
 void nfs_readhdr_free(struct nfs_pgio_header *hdr)
@@ -64,10 +84,18 @@ void nfs_readhdr_free(struct nfs_pgio_header *hdr)
 
 void nfs_readdata_release(struct nfs_read_data *rdata)
 {
+	struct nfs_pgio_header *hdr = rdata->header;
+	struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header);
+
 	put_nfs_open_context(rdata->args.context);
 	if (rdata->pages.pagevec != rdata->pages.page_array)
 		kfree(rdata->pages.pagevec);
-	nfs_readhdr_free(rdata->header);
+	if (rdata != &read_header->rpc_data)
+		kfree(rdata);
+	else
+		rdata->header = NULL;
+	if (atomic_dec_and_test(&hdr->refcnt))
+		nfs_read_completion(hdr);
 }
 
 static
@@ -79,35 +107,6 @@ int nfs_return_empty_page(struct page *page)
 	return 0;
 }
 
-static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
-{
-	unsigned int remainder = data->args.count - data->res.count;
-	unsigned int base = data->args.pgbase + data->res.count;
-	unsigned int pglen;
-	struct page **pages;
-
-	if (data->res.eof == 0 || remainder == 0)
-		return;
-	/*
-	 * Note: "remainder" can never be negative, since we check for
-	 * 	this in the XDR code.
-	 */
-	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
-	base &= ~PAGE_CACHE_MASK;
-	pglen = PAGE_CACHE_SIZE - base;
-	for (;;) {
-		if (remainder <= pglen) {
-			zero_user(*pages, base, remainder);
-			break;
-		}
-		zero_user(*pages, base, pglen);
-		pages++;
-		remainder -= pglen;
-		pglen = PAGE_CACHE_SIZE;
-		base = 0;
-	}
-}
-
 void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
 		struct inode *inode)
 {
@@ -170,6 +169,46 @@ static void nfs_readpage_release(struct nfs_page *req)
 	nfs_release_request(req);
 }
 
+/* Note io was page aligned */
+void nfs_read_completion(struct nfs_pgio_header *hdr)
+{
+	unsigned long bytes = 0;
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+		goto out;
+	if (!test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
+		while (!list_empty(&hdr->pages)) {
+			struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+			struct page *page = req->wb_page;
+
+			if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
+				if (bytes > hdr->good_bytes)
+					zero_user(page, 0, PAGE_SIZE);
+				else if (hdr->good_bytes - bytes < PAGE_SIZE)
+					zero_user_segment(page,
+						hdr->good_bytes & ~PAGE_MASK,
+						PAGE_SIZE);
+			}
+			SetPageUptodate(page);
+			nfs_list_remove_request(req);
+			nfs_readpage_release(req);
+			bytes += PAGE_SIZE;
+		}
+	} else {
+		while (!list_empty(&hdr->pages)) {
+			struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+			bytes += req->wb_bytes;
+			if (bytes <= hdr->good_bytes)
+				SetPageUptodate(req->wb_page);
+			nfs_list_remove_request(req);
+			nfs_readpage_release(req);
+		}
+	}
+out:
+	hdr->release(hdr);
+}
+
 int nfs_initiate_read(struct rpc_clnt *clnt,
 		      struct nfs_read_data *data,
 		      const struct rpc_call_ops *call_ops)
@@ -214,16 +253,12 @@ EXPORT_SYMBOL_GPL(nfs_initiate_read);
 /*
  * Set up the NFS read request struct
  */
-static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+static void nfs_read_rpcsetup(struct nfs_read_data *data,
 		unsigned int count, unsigned int offset)
 {
-	struct inode *inode = data->header->inode;
-
-	data->header->req	  = req;
-	data->header->inode	  = inode;
-	data->header->cred	  = req->wb_context->cred;
+	struct nfs_page *req = data->header->req;
 
-	data->args.fh     = NFS_FH(inode);
+	data->args.fh     = NFS_FH(data->header->inode);
 	data->args.offset = req_offset(req) + offset;
 	data->args.pgbase = req->wb_pgbase + offset;
 	data->args.pages  = data->pages.pagevec;
@@ -255,7 +290,7 @@ nfs_do_multiple_reads(struct list_head *head,
 	while (!list_empty(head)) {
 		int ret2;
 
-		data = list_entry(head->next, struct nfs_read_data, list);
+		data = list_first_entry(head, struct nfs_read_data, list);
 		list_del_init(&data->list);
 
 		ret2 = nfs_do_read(data, call_ops);
@@ -265,7 +300,7 @@ nfs_do_multiple_reads(struct list_head *head,
 	return ret;
 }
 
-static void
+void
 nfs_async_read_error(struct list_head *head)
 {
 	struct nfs_page	*req;
@@ -290,11 +325,11 @@ nfs_async_read_error(struct list_head *head)
  * won't see the new data until our attribute cache is updated.  This is more
  * or less conventional NFS client behavior.
  */
-static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head *res)
+static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc,
+			    struct nfs_pgio_header *hdr)
 {
-	struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
+	struct nfs_page *req = hdr->req;
 	struct page *page = req->wb_page;
-	struct nfs_read_header *rhdr;
 	struct nfs_read_data *data;
 	size_t rsize = desc->pg_bsize, nbytes;
 	unsigned int offset;
@@ -302,85 +337,97 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head
 	int ret = 0;
 
 	nfs_list_remove_request(req);
+	nfs_list_add_request(req, &hdr->pages);
 
 	offset = 0;
 	nbytes = desc->pg_count;
 	do {
 		size_t len = min(nbytes,rsize);
 
-		rhdr = nfs_readhdr_alloc(1);
-		if (!rhdr)
+		data = nfs_readdata_alloc(hdr, 1);
+		if (!data)
 			goto out_bad;
-		data = &rhdr->rpc_data;
 		data->pages.pagevec[0] = page;
-		nfs_read_rpcsetup(req, data, len, offset);
-		list_add(&data->list, res);
+		nfs_read_rpcsetup(data, len, offset);
+		list_add(&data->list, &hdr->rpc_list);
 		requests++;
 		nbytes -= len;
 		offset += len;
 	} while(nbytes != 0);
-	atomic_set(&req->wb_complete, requests);
-	desc->pg_rpc_callops = &nfs_read_partial_ops;
+	desc->pg_rpc_callops = &nfs_read_common_ops;
 	return ret;
 out_bad:
-	while (!list_empty(res)) {
-		data = list_entry(res->next, struct nfs_read_data, list);
+	while (!list_empty(&hdr->rpc_list)) {
+		data = list_first_entry(&hdr->rpc_list, struct nfs_read_data, list);
 		list_del(&data->list);
 		nfs_readdata_release(data);
 	}
-	nfs_readpage_release(req);
+	nfs_async_read_error(&hdr->pages);
 	return -ENOMEM;
 }
 
-static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *res)
+static int nfs_pagein_one(struct nfs_pageio_descriptor *desc,
+			  struct nfs_pgio_header *hdr)
 {
 	struct nfs_page		*req;
 	struct page		**pages;
-	struct nfs_read_header	*rhdr;
-	struct nfs_read_data	*data;
+	struct nfs_read_data    *data;
 	struct list_head *head = &desc->pg_list;
 	int ret = 0;
 
-	rhdr = nfs_readhdr_alloc(nfs_page_array_len(desc->pg_base,
-						    desc->pg_count));
-	if (!rhdr) {
+	data = nfs_readdata_alloc(hdr, nfs_page_array_len(desc->pg_base,
+							  desc->pg_count));
+	if (!data) {
 		nfs_async_read_error(head);
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	data = &rhdr->rpc_data;
 	pages = data->pages.pagevec;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
-		nfs_list_add_request(req, &rhdr->header.pages);
+		nfs_list_add_request(req, &hdr->pages);
 		*pages++ = req->wb_page;
 	}
-	req = nfs_list_entry(rhdr->header.pages.next);
 
-	nfs_read_rpcsetup(req, data, desc->pg_count, 0);
-	list_add(&data->list, res);
-	desc->pg_rpc_callops = &nfs_read_full_ops;
+	nfs_read_rpcsetup(data, desc->pg_count, 0);
+	list_add(&data->list, &hdr->rpc_list);
+	desc->pg_rpc_callops = &nfs_read_common_ops;
 out:
 	return ret;
 }
 
-int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head)
+int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
+		       struct nfs_pgio_header *hdr)
 {
 	if (desc->pg_bsize < PAGE_CACHE_SIZE)
-		return nfs_pagein_multi(desc, head);
-	return nfs_pagein_one(desc, head);
+		return nfs_pagein_multi(desc, hdr);
+	return nfs_pagein_one(desc, hdr);
 }
 
 static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 {
-	LIST_HEAD(head);
+	struct nfs_read_header *rhdr;
+	struct nfs_pgio_header *hdr;
 	int ret;
 
-	ret = nfs_generic_pagein(desc, &head);
+	rhdr = nfs_readhdr_alloc();
+	if (!rhdr) {
+		nfs_async_read_error(&desc->pg_list);
+		return -ENOMEM;
+	}
+	hdr = &rhdr->header;
+	nfs_pgheader_init(desc, hdr, nfs_readhdr_free);
+	atomic_inc(&hdr->refcnt);
+	ret = nfs_generic_pagein(desc, hdr);
 	if (ret == 0)
-		ret = nfs_do_multiple_reads(&head, desc->pg_rpc_callops);
+		ret = nfs_do_multiple_reads(&hdr->rpc_list,
+					    desc->pg_rpc_callops);
+	else
+		set_bit(NFS_IOHDR_REDO, &hdr->flags);
+	if (atomic_dec_and_test(&hdr->refcnt))
+		nfs_read_completion(hdr);
 	return ret;
 }
 
@@ -419,15 +466,13 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
 	struct nfs_readargs *argp = &data->args;
 	struct nfs_readres *resp = &data->res;
 
-	if (resp->eof || resp->count == argp->count)
-		return;
-
 	/* This is a short read! */
 	nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD);
 	/* Has the server at least made some progress? */
-	if (resp->count == 0)
+	if (resp->count == 0) {
+		nfs_set_pgio_error(data->header, -EIO, argp->offset);
 		return;
-
+	}
 	/* Yes, so retry the read at the end of the data */
 	data->mds_offset += resp->count;
 	argp->offset += resp->count;
@@ -436,38 +481,34 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
 	rpc_restart_call_prepare(task);
 }
 
-/*
- * Handle a read reply that fills part of a page.
- */
-static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata)
+static void nfs_readpage_result_common(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
- 
+	struct nfs_pgio_header *hdr = data->header;
+
+	/* Note the only returns of nfs_readpage_result are 0 and -EAGAIN */
 	if (nfs_readpage_result(task, data) != 0)
 		return;
 	if (task->tk_status < 0)
-		return;
-
-	nfs_readpage_truncate_uninitialised_page(data);
-	nfs_readpage_retry(task, data);
+		nfs_set_pgio_error(hdr, task->tk_status, data->args.offset);
+	else if (data->res.eof) {
+		loff_t bound;
+
+		bound = data->args.offset + data->res.count;
+		spin_lock(&hdr->lock);
+		if (bound < hdr->io_start + hdr->good_bytes) {
+			set_bit(NFS_IOHDR_EOF, &hdr->flags);
+			clear_bit(NFS_IOHDR_ERROR, &hdr->flags);
+			hdr->good_bytes = bound - hdr->io_start;
+		}
+		spin_unlock(&hdr->lock);
+	} else if (data->res.count != data->args.count)
+		nfs_readpage_retry(task, data);
 }
 
-static void nfs_readpage_release_partial(void *calldata)
+static void nfs_readpage_release_common(void *calldata)
 {
-	struct nfs_read_data *data = calldata;
-	struct nfs_page *req = data->header->req;
-	struct page *page = req->wb_page;
-	int status = data->task.tk_status;
-
-	if (status < 0)
-		set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags);
-
-	if (atomic_dec_and_test(&req->wb_complete)) {
-		if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags))
-			SetPageUptodate(page);
-		nfs_readpage_release(req);
-	}
-	nfs_readdata_release(data);
+	nfs_readdata_release(calldata);
 }
 
 void nfs_read_prepare(struct rpc_task *task, void *calldata)
@@ -476,75 +517,10 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata)
 	NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
 }
 
-static const struct rpc_call_ops nfs_read_partial_ops = {
-	.rpc_call_prepare = nfs_read_prepare,
-	.rpc_call_done = nfs_readpage_result_partial,
-	.rpc_release = nfs_readpage_release_partial,
-};
-
-static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data)
-{
-	unsigned int count = data->res.count;
-	unsigned int base = data->args.pgbase;
-	struct page **pages;
-
-	if (data->res.eof)
-		count = data->args.count;
-	if (unlikely(count == 0))
-		return;
-	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
-	base &= ~PAGE_CACHE_MASK;
-	count += base;
-	for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
-		SetPageUptodate(*pages);
-	if (count == 0)
-		return;
-	/* Was this a short read? */
-	if (data->res.eof || data->res.count == data->args.count)
-		SetPageUptodate(*pages);
-}
-
-/*
- * This is the callback from RPC telling us whether a reply was
- * received or some error occurred (timeout or socket shutdown).
- */
-static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
-{
-	struct nfs_read_data *data = calldata;
-
-	if (nfs_readpage_result(task, data) != 0)
-		return;
-	if (task->tk_status < 0)
-		return;
-	/*
-	 * Note: nfs_readpage_retry may change the values of
-	 * data->args. In the multi-page case, we therefore need
-	 * to ensure that we call nfs_readpage_set_pages_uptodate()
-	 * first.
-	 */
-	nfs_readpage_truncate_uninitialised_page(data);
-	nfs_readpage_set_pages_uptodate(data);
-	nfs_readpage_retry(task, data);
-}
-
-static void nfs_readpage_release_full(void *calldata)
-{
-	struct nfs_read_data *data = calldata;
-	struct nfs_pgio_header *hdr = data->header;
-
-	while (!list_empty(&hdr->pages)) {
-		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
-
-		nfs_list_remove_request(req);
-		nfs_readpage_release(req);
-	}
-	nfs_readdata_release(calldata);
-}
-
-static const struct rpc_call_ops nfs_read_full_ops = {
+static const struct rpc_call_ops nfs_read_common_ops = {
 	.rpc_call_prepare = nfs_read_prepare,
-	.rpc_call_done = nfs_readpage_result_full,
-	.rpc_release = nfs_readpage_release_full,
+	.rpc_call_done = nfs_readpage_result_common,
+	.rpc_release = nfs_readpage_release_common,
 };
 
 /*
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index eac30d6..5c52034 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -27,7 +27,6 @@ enum {
 	PG_CLEAN,
 	PG_NEED_COMMIT,
 	PG_NEED_RESCHED,
-	PG_PARTIAL_READ_FAILED,
 	PG_COMMIT_TO_DS,
 };
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index e34beaf..1648621 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1187,14 +1187,30 @@ struct nfs_read_data {
 	struct nfs_client	*ds_clp;	/* pNFS data server */
 };
 
+/* used as flag bits in nfs_pgio_header */
+enum {
+	NFS_IOHDR_ERROR = 0,
+	NFS_IOHDR_EOF,
+	NFS_IOHDR_REDO,
+};
+
 struct nfs_pgio_header {
 	struct inode		*inode;
 	struct rpc_cred		*cred;
 	struct list_head	pages;
+	struct list_head	rpc_list;
+	atomic_t		refcnt;
 	struct nfs_page		*req;
 	struct pnfs_layout_segment *lseg;
+	loff_t			io_start;
 	const struct rpc_call_ops *mds_ops;
+	void (*release) (struct nfs_pgio_header *hdr);
+	spinlock_t		lock;
+	/* fields protected by lock */
 	int			pnfs_error;
+	int			error;		/* merge with pnfs_error */
+	unsigned long		good_bytes;	/* boundary of good data */
+	unsigned long		flags;
 };
 
 struct nfs_read_header {
-- 
cgit v0.10.2


From 6c75dc0d498caa402fb17b1bf769835a9db875c8 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:47 -0400
Subject: NFS: merge _full and _partial write rpc_ops

Decouple nfs_pgio_header and nfs_write_data, and have (possibly
multiple) nfs_write_datas each take a refcount on nfs_pgio_header.

For the moment keeps nfs_write_header as a way to preallocate a single
nfs_write_data with the nfs_pgio_header.  The code doesn't need this,
and would be prettier without, but given the amount of churn I am
already introducing I didn't want to play with tuning new mempools.

This also fixes bug in pnfs_ld_handle_write_error.  In the case of
desc->pg_bsize < PAGE_CACHE_SIZE, the pages list was empty, causing
replay attempt to do nothing.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 90b00ce..22a40c4 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -768,11 +768,17 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		bytes = min(wsize,count);
 
 		result = -ENOMEM;
-		whdr = nfs_writehdr_alloc(nfs_page_array_len(pgbase, bytes));
+		whdr = nfs_writehdr_alloc();
 		if (unlikely(!whdr))
 			break;
 
-		data = &whdr->rpc_data;
+		data = nfs_writedata_alloc(&whdr->header, nfs_page_array_len(pgbase, bytes));
+		if (!data) {
+			nfs_writehdr_free(&whdr->header);
+			break;
+		}
+		data->header = &whdr->header;
+		atomic_inc(&data->header->refcnt);
 		pages = &data->pages;
 
 		down_read(&current->mm->mmap_sem);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 33af5e5..16bc9c4 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -319,10 +319,14 @@ extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_readdata_release(struct nfs_read_data *rdata);
 
 /* write.c */
-extern struct nfs_write_header *nfs_writehdr_alloc(unsigned int npages);
+extern void nfs_async_write_error(struct list_head *head);
+extern struct nfs_write_header *nfs_writehdr_alloc(void);
 extern void nfs_writehdr_free(struct nfs_pgio_header *hdr);
+extern struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr,
+						  unsigned int pagecount);
+extern void nfs_write_completion(struct nfs_pgio_header *hdr);
 extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
-		struct list_head *head);
+			     struct nfs_pgio_header *hdr);
 extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
 				  struct inode *inode, int ioflags);
 extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 333e765..02d8170 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -314,7 +314,6 @@ static void filelayout_write_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
 
-	put_lseg(wdata->header->lseg);
 	wdata->header->mds_ops->rpc_release(data);
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ce31ab2..87af80d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3431,8 +3431,6 @@ void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
 	struct inode *inode = hdr->inode;
 
 	dprintk("%s Reset task for i/o through\n", __func__);
-	put_lseg(hdr->lseg);
-	hdr->lseg        = NULL;
 	data->ds_clp     = NULL;
 	data->write_done_cb = nfs4_write_done_cb;
 	data->args.fh       = NFS_FH(inode);
@@ -3448,7 +3446,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 {
 	struct nfs_server *server = NFS_SERVER(data->header->inode);
 
-	if (data->header->lseg) {
+	if (data->ds_clp) {
 		data->args.bitmask = NULL;
 		data->res.fattr = NULL;
 	} else
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d1a91db..d515f00 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1199,7 +1199,9 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
 		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
 		pnfs_return_layout(hdr->inode);
 	}
-	data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, &hdr->pages);
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+		data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
+								&hdr->pages);
 }
 
 /*
@@ -1214,7 +1216,6 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
 		hdr->mds_ops->rpc_call_done(&data->task, data);
 	} else
 		pnfs_ld_handle_write_error(data);
-	put_lseg(hdr->lseg);
 	hdr->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
@@ -1225,12 +1226,11 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
 {
 	struct nfs_pgio_header *hdr = data->header;
 
-	list_splice_tail_init(&hdr->pages, &desc->pg_list);
-	if (hdr->req && list_empty(&hdr->req->wb_list))
-		nfs_list_add_request(hdr->req, &desc->pg_list);
-	nfs_pageio_reset_write_mds(desc);
-	desc->pg_recoalesce = 1;
-	put_lseg(hdr->lseg);
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		list_splice_tail_init(&hdr->pages, &desc->pg_list);
+		nfs_pageio_reset_write_mds(desc);
+		desc->pg_recoalesce = 1;
+	}
 	nfs_writedata_release(data);
 }
 
@@ -1246,18 +1246,12 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	hdr->mds_ops = call_ops;
-	hdr->lseg = get_lseg(lseg);
 
 	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
 		inode->i_ino, wdata->args.count, wdata->args.offset, how);
-
 	trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
-	if (trypnfs == PNFS_NOT_ATTEMPTED) {
-		put_lseg(hdr->lseg);
-		hdr->lseg = NULL;
-	} else
+	if (trypnfs != PNFS_NOT_ATTEMPTED)
 		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
-
 	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
 	return trypnfs;
 }
@@ -1273,7 +1267,7 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he
 	while (!list_empty(head)) {
 		enum pnfs_try_status trypnfs;
 
-		data = list_entry(head->next, struct nfs_write_data, list);
+		data = list_first_entry(head, struct nfs_write_data, list);
 		list_del_init(&data->list);
 
 		trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
@@ -1283,20 +1277,40 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he
 	put_lseg(lseg);
 }
 
+static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
+{
+	put_lseg(hdr->lseg);
+	nfs_writehdr_free(hdr);
+}
+
 int
 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 {
-	LIST_HEAD(head);
+	struct nfs_write_header *whdr;
+	struct nfs_pgio_header *hdr;
 	int ret;
 
-	ret = nfs_generic_flush(desc, &head);
-	if (ret != 0) {
+	whdr = nfs_writehdr_alloc();
+	if (!whdr) {
+		nfs_async_write_error(&desc->pg_list);
 		put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
-		return ret;
+		return -ENOMEM;
 	}
-	pnfs_do_multiple_writes(desc, &head, desc->pg_ioflags);
-	return 0;
+	hdr = &whdr->header;
+	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
+	hdr->lseg = get_lseg(desc->pg_lseg);
+	atomic_inc(&hdr->refcnt);
+	ret = nfs_generic_flush(desc, hdr);
+	if (ret != 0) {
+		put_lseg(desc->pg_lseg);
+		desc->pg_lseg = NULL;
+		set_bit(NFS_IOHDR_REDO, &hdr->flags);
+	} else
+		pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
+	if (atomic_dec_and_test(&hdr->refcnt))
+		nfs_write_completion(hdr);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2efae04..076075e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -42,8 +42,7 @@
 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
 				  struct inode *inode, int ioflags);
 static void nfs_redirty_request(struct nfs_page *req);
-static const struct rpc_call_ops nfs_write_partial_ops;
-static const struct rpc_call_ops nfs_write_full_ops;
+static const struct rpc_call_ops nfs_write_common_ops;
 static const struct rpc_call_ops nfs_commit_ops;
 
 static struct kmem_cache *nfs_wdata_cachep;
@@ -69,26 +68,47 @@ void nfs_commit_free(struct nfs_commit_data *p)
 }
 EXPORT_SYMBOL_GPL(nfs_commit_free);
 
-struct nfs_write_header *nfs_writehdr_alloc(unsigned int pagecount)
+struct nfs_write_header *nfs_writehdr_alloc(void)
 {
 	struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
 
 	if (p) {
 		struct nfs_pgio_header *hdr = &p->header;
-		struct nfs_write_data *data = &p->rpc_data;
 
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&hdr->pages);
-		INIT_LIST_HEAD(&data->list);
-		data->header = hdr;
-		if (!nfs_pgarray_set(&data->pages, pagecount)) {
-			mempool_free(p, nfs_wdata_mempool);
-			p = NULL;
-		}
+		INIT_LIST_HEAD(&hdr->rpc_list);
+		spin_lock_init(&hdr->lock);
+		atomic_set(&hdr->refcnt, 0);
 	}
 	return p;
 }
 
+struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr,
+					   unsigned int pagecount)
+{
+	struct nfs_write_data *data, *prealloc;
+
+	prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data;
+	if (prealloc->header == NULL)
+		data = prealloc;
+	else
+		data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto out;
+
+	if (nfs_pgarray_set(&data->pages, pagecount)) {
+		data->header = hdr;
+		atomic_inc(&hdr->refcnt);
+	} else {
+		if (data != prealloc)
+			kfree(data);
+		data = NULL;
+	}
+out:
+	return data;
+}
+
 void nfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
 	struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);
@@ -97,10 +117,18 @@ void nfs_writehdr_free(struct nfs_pgio_header *hdr)
 
 void nfs_writedata_release(struct nfs_write_data *wdata)
 {
+	struct nfs_pgio_header *hdr = wdata->header;
+	struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header);
+
 	put_nfs_open_context(wdata->args.context);
 	if (wdata->pages.pagevec != wdata->pages.page_array)
 		kfree(wdata->pages.pagevec);
-	nfs_writehdr_free(wdata->header);
+	if (wdata != &write_header->rpc_data)
+		kfree(wdata);
+	else
+		wdata->header = NULL;
+	if (atomic_dec_and_test(&hdr->refcnt))
+		nfs_write_completion(hdr);
 }
 
 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -511,20 +539,6 @@ int nfs_write_need_commit(struct nfs_write_data *data)
 	return data->verf.committed != NFS_FILE_SYNC;
 }
 
-static inline
-int nfs_reschedule_unstable_write(struct nfs_page *req,
-				  struct nfs_write_data *data)
-{
-	if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
-		nfs_mark_request_commit(req, data->header->lseg);
-		return 1;
-	}
-	if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) {
-		nfs_mark_request_dirty(req);
-		return 1;
-	}
-	return 0;
-}
 #else
 static void
 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
@@ -542,13 +556,43 @@ int nfs_write_need_commit(struct nfs_write_data *data)
 	return 0;
 }
 
-static inline
-int nfs_reschedule_unstable_write(struct nfs_page *req,
-				  struct nfs_write_data *data)
+#endif
+
+void nfs_write_completion(struct nfs_pgio_header *hdr)
 {
-	return 0;
+	unsigned long bytes = 0;
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+		goto out;
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+		struct page *page = req->wb_page;
+
+		bytes += req->wb_bytes;
+		nfs_list_remove_request(req);
+		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
+		    (hdr->good_bytes < bytes)) {
+			nfs_set_pageerror(page);
+			nfs_context_set_write_error(req->wb_context, hdr->error);
+			goto remove_req;
+		}
+		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
+			nfs_mark_request_dirty(req);
+			goto next;
+		}
+		if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+			nfs_mark_request_commit(req, hdr->lseg);
+			goto next;
+		}
+remove_req:
+		nfs_inode_remove_request(req);
+next:
+		nfs_unlock_request(req);
+		nfs_end_page_writeback(page);
+	}
+out:
+	hdr->release(hdr);
 }
-#endif
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 static int
@@ -813,17 +857,6 @@ int nfs_updatepage(struct file *file, struct page *page,
 	return status;
 }
 
-static void nfs_writepage_release(struct nfs_page *req,
-				  struct nfs_write_data *data)
-{
-	struct page *page = req->wb_page;
-
-	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data))
-		nfs_inode_remove_request(req);
-	nfs_unlock_request(req);
-	nfs_end_page_writeback(page);
-}
-
 static int flush_task_priority(int how)
 {
 	switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) {
@@ -890,22 +923,16 @@ EXPORT_SYMBOL_GPL(nfs_initiate_write);
 /*
  * Set up the argument/result storage required for the RPC call.
  */
-static void nfs_write_rpcsetup(struct nfs_page *req,
-		struct nfs_write_data *data,
+static void nfs_write_rpcsetup(struct nfs_write_data *data,
 		unsigned int count, unsigned int offset,
 		int how)
 {
-	struct nfs_pgio_header *hdr = data->header;
-	struct inode *inode = req->wb_context->dentry->d_inode;
+	struct nfs_page *req = data->header->req;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
 
-	hdr->req = req;
-	hdr->inode = inode = req->wb_context->dentry->d_inode;
-	hdr->cred = req->wb_context->cred;
-
-	data->args.fh     = NFS_FH(inode);
+	data->args.fh     = NFS_FH(data->header->inode);
 	data->args.offset = req_offset(req) + offset;
 	/* pnfs_set_layoutcommit needs this */
 	data->mds_offset = data->args.offset;
@@ -919,7 +946,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	case 0:
 		break;
 	case FLUSH_COND_STABLE:
-		if (nfs_need_commit(NFS_I(inode)))
+		if (nfs_need_commit(NFS_I(data->header->inode)))
 			break;
 	default:
 		data->args.stable = NFS_FILE_SYNC;
@@ -950,7 +977,7 @@ static int nfs_do_multiple_writes(struct list_head *head,
 	while (!list_empty(head)) {
 		int ret2;
 
-		data = list_entry(head->next, struct nfs_write_data, list);
+		data = list_first_entry(head, struct nfs_write_data, list);
 		list_del_init(&data->list);
 		
 		ret2 = nfs_do_write(data, call_ops, how);
@@ -973,15 +1000,26 @@ static void nfs_redirty_request(struct nfs_page *req)
 	nfs_end_page_writeback(page);
 }
 
+void nfs_async_write_error(struct list_head *head)
+{
+	struct nfs_page	*req;
+
+	while (!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_redirty_request(req);
+	}
+}
+
 /*
  * Generate multiple small requests to write out a single
  * contiguous dirty area on one page.
  */
-static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head *res)
+static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
+			   struct nfs_pgio_header *hdr)
 {
-	struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
+	struct nfs_page *req = hdr->req;
 	struct page *page = req->wb_page;
-	struct nfs_write_header *whdr;
 	struct nfs_write_data *data;
 	size_t wsize = desc->pg_bsize, nbytes;
 	unsigned int offset;
@@ -989,6 +1027,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head
 	int ret = 0;
 
 	nfs_list_remove_request(req);
+	nfs_list_add_request(req, &hdr->pages);
 
 	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
 	    (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit ||
@@ -1001,28 +1040,27 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head
 	do {
 		size_t len = min(nbytes, wsize);
 
-		whdr = nfs_writehdr_alloc(1);
-		if (!whdr)
+		data = nfs_writedata_alloc(hdr, 1);
+		if (!data)
 			goto out_bad;
-		data = &whdr->rpc_data;
 		data->pages.pagevec[0] = page;
-		nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags);
-		list_add(&data->list, res);
+		nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags);
+		list_add(&data->list, &hdr->rpc_list);
 		requests++;
 		nbytes -= len;
 		offset += len;
 	} while (nbytes != 0);
 	atomic_set(&req->wb_complete, requests);
-	desc->pg_rpc_callops = &nfs_write_partial_ops;
+	desc->pg_rpc_callops = &nfs_write_common_ops;
 	return ret;
 
 out_bad:
-	while (!list_empty(res)) {
-		data = list_entry(res->next, struct nfs_write_data, list);
+	while (!list_empty(&hdr->rpc_list)) {
+		data = list_first_entry(&hdr->rpc_list, struct nfs_write_data, list);
 		list_del(&data->list);
 		nfs_writedata_release(data);
 	}
-	nfs_redirty_request(req);
+	nfs_async_write_error(&hdr->pages);
 	return -ENOMEM;
 }
 
@@ -1034,64 +1072,74 @@ out_bad:
  * This is the case if nfs_updatepage detects a conflicting request
  * that has been written but not committed.
  */
-static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *res)
+static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
+			 struct nfs_pgio_header *hdr)
 {
 	struct nfs_page		*req;
 	struct page		**pages;
-	struct nfs_write_header	*whdr;
 	struct nfs_write_data	*data;
 	struct list_head *head = &desc->pg_list;
 	int ret = 0;
 
-	whdr = nfs_writehdr_alloc(nfs_page_array_len(desc->pg_base,
-						     desc->pg_count));
-	if (!whdr) {
-		while (!list_empty(head)) {
-			req = nfs_list_entry(head->next);
-			nfs_list_remove_request(req);
-			nfs_redirty_request(req);
-		}
+	data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base,
+							   desc->pg_count));
+	if (!data) {
+		nfs_async_write_error(head);
 		ret = -ENOMEM;
 		goto out;
 	}
-	data = &whdr->rpc_data;
+
 	pages = data->pages.pagevec;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
-		nfs_list_add_request(req, &whdr->header.pages);
+		nfs_list_add_request(req, &hdr->pages);
 		*pages++ = req->wb_page;
 	}
-	req = nfs_list_entry(whdr->header.pages.next);
 
 	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
 	    (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
 		desc->pg_ioflags &= ~FLUSH_COND_STABLE;
 
 	/* Set up the argument struct */
-	nfs_write_rpcsetup(req, data, desc->pg_count, 0, desc->pg_ioflags);
-	list_add(&data->list, res);
-	desc->pg_rpc_callops = &nfs_write_full_ops;
+	nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags);
+	list_add(&data->list, &hdr->rpc_list);
+	desc->pg_rpc_callops = &nfs_write_common_ops;
 out:
 	return ret;
 }
 
-int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head)
+int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
+		      struct nfs_pgio_header *hdr)
 {
 	if (desc->pg_bsize < PAGE_CACHE_SIZE)
-		return nfs_flush_multi(desc, head);
-	return nfs_flush_one(desc, head);
+		return nfs_flush_multi(desc, hdr);
+	return nfs_flush_one(desc, hdr);
 }
 
 static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 {
-	LIST_HEAD(head);
+	struct nfs_write_header *whdr;
+	struct nfs_pgio_header *hdr;
 	int ret;
 
-	ret = nfs_generic_flush(desc, &head);
+	whdr = nfs_writehdr_alloc();
+	if (!whdr) {
+		nfs_async_write_error(&desc->pg_list);
+		return -ENOMEM;
+	}
+	hdr = &whdr->header;
+	nfs_pgheader_init(desc, hdr, nfs_writehdr_free);
+	atomic_inc(&hdr->refcnt);
+	ret = nfs_generic_flush(desc, hdr);
 	if (ret == 0)
-		ret = nfs_do_multiple_writes(&head, desc->pg_rpc_callops,
-				desc->pg_ioflags);
+		ret = nfs_do_multiple_writes(&hdr->rpc_list,
+					     desc->pg_rpc_callops,
+					     desc->pg_ioflags);
+	else
+		set_bit(NFS_IOHDR_REDO, &hdr->flags);
+	if (atomic_dec_and_test(&hdr->refcnt))
+		nfs_write_completion(hdr);
 	return ret;
 }
 
@@ -1121,62 +1169,6 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 		nfs_pageio_init_write_mds(pgio, inode, ioflags);
 }
 
-/*
- * Handle a write reply that flushed part of a page.
- */
-static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
-{
-	struct nfs_write_data	*data = calldata;
-
-	dprintk("NFS: %5u write(%s/%lld %d@%lld)",
-		task->tk_pid,
-		data->header->inode->i_sb->s_id,
-		(long long)
-		  NFS_FILEID(data->header->inode),
-		data->header->req->wb_bytes,
-		(long long)req_offset(data->header->req));
-
-	nfs_writeback_done(task, data);
-}
-
-static void nfs_writeback_release_partial(void *calldata)
-{
-	struct nfs_write_data	*data = calldata;
-	struct nfs_page		*req = data->header->req;
-	struct page		*page = req->wb_page;
-	int status = data->task.tk_status;
-
-	if (status < 0) {
-		nfs_set_pageerror(page);
-		nfs_context_set_write_error(req->wb_context, status);
-		dprintk(", error = %d\n", status);
-		goto out;
-	}
-
-	if (nfs_write_need_commit(data)) {
-		struct inode *inode = page->mapping->host;
-
-		spin_lock(&inode->i_lock);
-		if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) {
-			/* Do nothing we need to resend the writes */
-		} else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) {
-			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
-			dprintk(" defer commit\n");
-		} else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) {
-			set_bit(PG_NEED_RESCHED, &req->wb_flags);
-			clear_bit(PG_NEED_COMMIT, &req->wb_flags);
-			dprintk(" server reboot detected\n");
-		}
-		spin_unlock(&inode->i_lock);
-	} else
-		dprintk(" OK\n");
-
-out:
-	if (atomic_dec_and_test(&req->wb_complete))
-		nfs_writepage_release(req, data);
-	nfs_writedata_release(data);
-}
-
 void nfs_write_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
@@ -1190,12 +1182,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
 	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-static const struct rpc_call_ops nfs_write_partial_ops = {
-	.rpc_call_prepare = nfs_write_prepare,
-	.rpc_call_done = nfs_writeback_done_partial,
-	.rpc_release = nfs_writeback_release_partial,
-};
-
 /*
  * Handle a write reply that flushes a whole page.
  *
@@ -1203,60 +1189,37 @@ static const struct rpc_call_ops nfs_write_partial_ops = {
  *	  writebacks since the page->count is kept > 1 for as long
  *	  as the page has a write request pending.
  */
-static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
+static void nfs_writeback_done_common(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
 
 	nfs_writeback_done(task, data);
 }
 
-static void nfs_writeback_release_full(void *calldata)
+static void nfs_writeback_release_common(void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
 	struct nfs_pgio_header *hdr = data->header;
 	int status = data->task.tk_status;
+	struct nfs_page *req = hdr->req;
 
-	/* Update attributes as result of writeback. */
-	while (!list_empty(&hdr->pages)) {
-		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
-		struct page *page = req->wb_page;
-
-		nfs_list_remove_request(req);
-
-		dprintk("NFS: %5u write (%s/%lld %d@%lld)",
-			data->task.tk_pid,
-			req->wb_context->dentry->d_inode->i_sb->s_id,
-			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
-			req->wb_bytes,
-			(long long)req_offset(req));
-
-		if (status < 0) {
-			nfs_set_pageerror(page);
-			nfs_context_set_write_error(req->wb_context, status);
-			dprintk(", error = %d\n", status);
-			goto remove_request;
-		}
-
-		if (nfs_write_need_commit(data)) {
+	if ((status >= 0) && nfs_write_need_commit(data)) {
+		spin_lock(&hdr->lock);
+		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
+			; /* Do nothing */
+		else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
 			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
-			nfs_mark_request_commit(req, hdr->lseg);
-			dprintk(" marked for commit\n");
-			goto next;
-		}
-		dprintk(" OK\n");
-remove_request:
-		nfs_inode_remove_request(req);
-	next:
-		nfs_unlock_request(req);
-		nfs_end_page_writeback(page);
+		else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf)))
+			set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
+		spin_unlock(&hdr->lock);
 	}
 	nfs_writedata_release(data);
 }
 
-static const struct rpc_call_ops nfs_write_full_ops = {
+static const struct rpc_call_ops nfs_write_common_ops = {
 	.rpc_call_prepare = nfs_write_prepare,
-	.rpc_call_done = nfs_writeback_done_full,
-	.rpc_release = nfs_writeback_release_full,
+	.rpc_call_done = nfs_writeback_done_common,
+	.rpc_release = nfs_writeback_release_common,
 };
 
 
@@ -1307,38 +1270,40 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 		}
 	}
 #endif
-	/* Is this a short write? */
-	if (task->tk_status >= 0 && resp->count < argp->count) {
+	if (task->tk_status < 0)
+		nfs_set_pgio_error(data->header, task->tk_status, argp->offset);
+	else if (resp->count < argp->count) {
 		static unsigned long    complain;
 
+		/* This a short write! */
 		nfs_inc_stats(inode, NFSIOS_SHORTWRITE);
 
 		/* Has the server at least made some progress? */
-		if (resp->count != 0) {
-			/* Was this an NFSv2 write or an NFSv3 stable write? */
-			if (resp->verf->committed != NFS_UNSTABLE) {
-				/* Resend from where the server left off */
-				data->mds_offset += resp->count;
-				argp->offset += resp->count;
-				argp->pgbase += resp->count;
-				argp->count -= resp->count;
-			} else {
-				/* Resend as a stable write in order to avoid
-				 * headaches in the case of a server crash.
-				 */
-				argp->stable = NFS_FILE_SYNC;
+		if (resp->count == 0) {
+			if (time_before(complain, jiffies)) {
+				printk(KERN_WARNING
+				       "NFS: Server wrote zero bytes, expected %u.\n",
+				       argp->count);
+				complain = jiffies + 300 * HZ;
 			}
-			rpc_restart_call_prepare(task);
+			nfs_set_pgio_error(data->header, -EIO, argp->offset);
+			task->tk_status = -EIO;
 			return;
 		}
-		if (time_before(complain, jiffies)) {
-			printk(KERN_WARNING
-			       "NFS: Server wrote zero bytes, expected %u.\n",
-					argp->count);
-			complain = jiffies + 300 * HZ;
+		/* Was this an NFSv2 write or an NFSv3 stable write? */
+		if (resp->verf->committed != NFS_UNSTABLE) {
+			/* Resend from where the server left off */
+			data->mds_offset += resp->count;
+			argp->offset += resp->count;
+			argp->pgbase += resp->count;
+			argp->count -= resp->count;
+		} else {
+			/* Resend as a stable write in order to avoid
+			 * headaches in the case of a server crash.
+			 */
+			argp->stable = NFS_FILE_SYNC;
 		}
-		/* Can't do anything about it except throw an error. */
-		task->tk_status = -EIO;
+		rpc_restart_call_prepare(task);
 	}
 }
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 1648621..0d17db7 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1192,6 +1192,8 @@ enum {
 	NFS_IOHDR_ERROR = 0,
 	NFS_IOHDR_EOF,
 	NFS_IOHDR_REDO,
+	NFS_IOHDR_NEED_COMMIT,
+	NFS_IOHDR_NEED_RESCHED,
 };
 
 struct nfs_pgio_header {
-- 
cgit v0.10.2


From 061ae2edb7375ab6776468b075da71008a098b55 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:48 -0400
Subject: NFS: create completion structure to pass into page_init functions

Factors out the code that will need to change when directio
starts using these code paths.  This will allow directio to use
the generic pagein and flush routines

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 16bc9c4..3ef8fcd 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -300,11 +300,10 @@ extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
 extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
 #endif
 
+struct nfs_pgio_completion_ops;
 /* read.c */
-extern void nfs_async_read_error(struct list_head *head);
 extern struct nfs_read_header *nfs_readhdr_alloc(void);
 extern void nfs_readhdr_free(struct nfs_pgio_header *hdr);
-extern void nfs_read_completion(struct nfs_pgio_header *hdr);
 extern struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr,
 						unsigned int pagecount);
 extern int nfs_initiate_read(struct rpc_clnt *clnt,
@@ -314,21 +313,21 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
 			      struct nfs_pgio_header *hdr);
 extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
-		struct inode *inode);
+			struct inode *inode,
+			const struct nfs_pgio_completion_ops *compl_ops);
 extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_readdata_release(struct nfs_read_data *rdata);
 
 /* write.c */
-extern void nfs_async_write_error(struct list_head *head);
 extern struct nfs_write_header *nfs_writehdr_alloc(void);
 extern void nfs_writehdr_free(struct nfs_pgio_header *hdr);
 extern struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr,
 						  unsigned int pagecount);
-extern void nfs_write_completion(struct nfs_pgio_header *hdr);
 extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
 			     struct nfs_pgio_header *hdr);
 extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
-				  struct inode *inode, int ioflags);
+			struct inode *inode, int ioflags,
+			const struct nfs_pgio_completion_ops *compl_ops);
 extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_writedata_release(struct nfs_write_data *wdata);
 extern void nfs_commit_free(struct nfs_commit_data *p);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index cd4c038..4cf2a68 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -49,6 +49,7 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
 	hdr->io_start = req_offset(hdr->req);
 	hdr->good_bytes = desc->pg_count;
 	hdr->release = release;
+	hdr->completion_ops = desc->pg_completion_ops;
 }
 
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
@@ -240,6 +241,7 @@ EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
 void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 		     struct inode *inode,
 		     const struct nfs_pageio_ops *pg_ops,
+		     const struct nfs_pgio_completion_ops *compl_ops,
 		     size_t bsize,
 		     int io_flags)
 {
@@ -252,6 +254,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_recoalesce = 0;
 	desc->pg_inode = inode;
 	desc->pg_ops = pg_ops;
+	desc->pg_completion_ops = compl_ops;
 	desc->pg_ioflags = io_flags;
 	desc->pg_error = 0;
 	desc->pg_lseg = NULL;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d515f00..b3a0c01 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1113,26 +1113,31 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
 
 bool
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode,
+		      const struct nfs_pgio_completion_ops *compl_ops)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
 
 	if (ld == NULL)
 		return false;
-	nfs_pageio_init(pgio, inode, ld->pg_read_ops, server->rsize, 0);
+	nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops,
+			server->rsize, 0);
 	return true;
 }
 
 bool
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags)
+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
+		       int ioflags,
+		       const struct nfs_pgio_completion_ops *compl_ops)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
 
 	if (ld == NULL)
 		return false;
-	nfs_pageio_init(pgio, inode, ld->pg_write_ops, server->wsize, ioflags);
+	nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops,
+			server->wsize, ioflags);
 	return true;
 }
 
@@ -1162,13 +1167,15 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
 
-static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head)
+static int pnfs_write_done_resend_to_mds(struct inode *inode,
+				struct list_head *head,
+				const struct nfs_pgio_completion_ops *compl_ops)
 {
 	struct nfs_pageio_descriptor pgio;
 	LIST_HEAD(failed);
 
 	/* Resend all requests through the MDS */
-	nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE);
+	nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE, compl_ops);
 	while (!list_empty(head)) {
 		struct nfs_page *req = nfs_list_entry(head->next);
 
@@ -1201,7 +1208,8 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
 	}
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
 		data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
-								&hdr->pages);
+							&hdr->pages,
+							hdr->completion_ops);
 }
 
 /*
@@ -1292,7 +1300,7 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 
 	whdr = nfs_writehdr_alloc();
 	if (!whdr) {
-		nfs_async_write_error(&desc->pg_list);
+		desc->pg_completion_ops->error_cleanup(&hdr->pages);
 		put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 		return -ENOMEM;
@@ -1309,18 +1317,20 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 	} else
 		pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
 	if (atomic_dec_and_test(&hdr->refcnt))
-		nfs_write_completion(hdr);
+		hdr->completion_ops->completion(hdr);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
 
-static int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head)
+static int pnfs_read_done_resend_to_mds(struct inode *inode,
+				struct list_head *head,
+				const struct nfs_pgio_completion_ops *compl_ops)
 {
 	struct nfs_pageio_descriptor pgio;
 	LIST_HEAD(failed);
 
 	/* Resend all requests through the MDS */
-	nfs_pageio_init_read_mds(&pgio, inode);
+	nfs_pageio_init_read_mds(&pgio, inode, compl_ops);
 	while (!list_empty(head)) {
 		struct nfs_page *req = nfs_list_entry(head->next);
 
@@ -1349,7 +1359,8 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
 	}
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
 		data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
-								&hdr->pages);
+							&hdr->pages,
+							hdr->completion_ops);
 }
 
 /*
@@ -1443,7 +1454,7 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 
 	rhdr = nfs_readhdr_alloc();
 	if (!rhdr) {
-		nfs_async_read_error(&desc->pg_list);
+		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 		ret = -ENOMEM;
 		put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
@@ -1461,7 +1472,7 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 	} else
 		pnfs_do_multiple_reads(desc, &hdr->rpc_list);
 	if (atomic_dec_and_test(&hdr->refcnt))
-		nfs_read_completion(hdr);
+		hdr->completion_ops->completion(hdr);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 442ebf6..734e4ef 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -168,8 +168,10 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 void get_layout_hdr(struct pnfs_layout_hdr *lo);
 void put_lseg(struct pnfs_layout_segment *lseg);
 
-bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
-bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int);
+bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
+			   const struct nfs_pgio_completion_ops *);
+bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *,
+			    int, const struct nfs_pgio_completion_ops *);
 
 void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
 void unset_pnfs_layoutdriver(struct nfs_server *);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index c9633b2..5e78af1 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -31,6 +31,7 @@
 
 static const struct nfs_pageio_ops nfs_pageio_read_ops;
 static const struct rpc_call_ops nfs_read_common_ops;
+static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
 
@@ -95,7 +96,7 @@ void nfs_readdata_release(struct nfs_read_data *rdata)
 	else
 		rdata->header = NULL;
 	if (atomic_dec_and_test(&hdr->refcnt))
-		nfs_read_completion(hdr);
+		hdr->completion_ops->completion(hdr);
 }
 
 static
@@ -108,9 +109,10 @@ int nfs_return_empty_page(struct page *page)
 }
 
 void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
-		struct inode *inode)
+			      struct inode *inode,
+			      const struct nfs_pgio_completion_ops *compl_ops)
 {
-	nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
+	nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, compl_ops,
 			NFS_SERVER(inode)->rsize, 0);
 }
 
@@ -122,10 +124,11 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
 
 static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
-		struct inode *inode)
+				struct inode *inode,
+				const struct nfs_pgio_completion_ops *compl_ops)
 {
-	if (!pnfs_pageio_init_read(pgio, inode))
-		nfs_pageio_init_read_mds(pgio, inode);
+	if (!pnfs_pageio_init_read(pgio, inode, compl_ops))
+		nfs_pageio_init_read_mds(pgio, inode, compl_ops);
 }
 
 int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
@@ -146,7 +149,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	if (len < PAGE_CACHE_SIZE)
 		zero_user_segment(page, len, PAGE_CACHE_SIZE);
 
-	nfs_pageio_init_read(&pgio, inode);
+	nfs_pageio_init_read(&pgio, inode, &nfs_async_read_completion_ops);
 	nfs_pageio_add_request(&pgio, new);
 	nfs_pageio_complete(&pgio);
 	return 0;
@@ -170,7 +173,7 @@ static void nfs_readpage_release(struct nfs_page *req)
 }
 
 /* Note io was page aligned */
-void nfs_read_completion(struct nfs_pgio_header *hdr)
+static void nfs_read_completion(struct nfs_pgio_header *hdr)
 {
 	unsigned long bytes = 0;
 
@@ -300,7 +303,7 @@ nfs_do_multiple_reads(struct list_head *head,
 	return ret;
 }
 
-void
+static void
 nfs_async_read_error(struct list_head *head)
 {
 	struct nfs_page	*req;
@@ -312,6 +315,11 @@ nfs_async_read_error(struct list_head *head)
 	}
 }
 
+static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
+	.error_cleanup = nfs_async_read_error,
+	.completion = nfs_read_completion,
+};
+
 /*
  * Generate multiple requests to fill a single page.
  *
@@ -362,7 +370,7 @@ out_bad:
 		list_del(&data->list);
 		nfs_readdata_release(data);
 	}
-	nfs_async_read_error(&hdr->pages);
+	desc->pg_completion_ops->error_cleanup(&hdr->pages);
 	return -ENOMEM;
 }
 
@@ -378,7 +386,7 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc,
 	data = nfs_readdata_alloc(hdr, nfs_page_array_len(desc->pg_base,
 							  desc->pg_count));
 	if (!data) {
-		nfs_async_read_error(head);
+		desc->pg_completion_ops->error_cleanup(head);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -414,7 +422,7 @@ static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 
 	rhdr = nfs_readhdr_alloc();
 	if (!rhdr) {
-		nfs_async_read_error(&desc->pg_list);
+		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 		return -ENOMEM;
 	}
 	hdr = &rhdr->header;
@@ -427,7 +435,7 @@ static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 	else
 		set_bit(NFS_IOHDR_REDO, &hdr->flags);
 	if (atomic_dec_and_test(&hdr->refcnt))
-		nfs_read_completion(hdr);
+		hdr->completion_ops->completion(hdr);
 	return ret;
 }
 
@@ -652,7 +660,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	if (ret == 0)
 		goto read_complete; /* all pages were read */
 
-	nfs_pageio_init_read(&pgio, inode);
+	nfs_pageio_init_read(&pgio, inode, &nfs_async_read_completion_ops);
 
 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 076075e..1503972 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -40,10 +40,12 @@
  * Local function declarations
  */
 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
-				  struct inode *inode, int ioflags);
+			struct inode *inode, int ioflags,
+			const struct nfs_pgio_completion_ops *compl_ops);
 static void nfs_redirty_request(struct nfs_page *req);
 static const struct rpc_call_ops nfs_write_common_ops;
 static const struct rpc_call_ops nfs_commit_ops;
+static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
 
 static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
@@ -128,7 +130,7 @@ void nfs_writedata_release(struct nfs_write_data *wdata)
 	else
 		wdata->header = NULL;
 	if (atomic_dec_and_test(&hdr->refcnt))
-		nfs_write_completion(hdr);
+		hdr->completion_ops->completion(hdr);
 }
 
 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -337,7 +339,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
 	struct nfs_pageio_descriptor pgio;
 	int err;
 
-	nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc));
+	nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc),
+			      &nfs_async_write_completion_ops);
 	err = nfs_do_writepage(page, wbc, &pgio);
 	nfs_pageio_complete(&pgio);
 	if (err < 0)
@@ -380,7 +383,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
-	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
+	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
+			      &nfs_async_write_completion_ops);
 	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
 	nfs_pageio_complete(&pgio);
 
@@ -558,7 +562,7 @@ int nfs_write_need_commit(struct nfs_write_data *data)
 
 #endif
 
-void nfs_write_completion(struct nfs_pgio_header *hdr)
+static void nfs_write_completion(struct nfs_pgio_header *hdr)
 {
 	unsigned long bytes = 0;
 
@@ -1000,7 +1004,7 @@ static void nfs_redirty_request(struct nfs_page *req)
 	nfs_end_page_writeback(page);
 }
 
-void nfs_async_write_error(struct list_head *head)
+static void nfs_async_write_error(struct list_head *head)
 {
 	struct nfs_page	*req;
 
@@ -1011,6 +1015,11 @@ void nfs_async_write_error(struct list_head *head)
 	}
 }
 
+static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
+	.error_cleanup = nfs_async_write_error,
+	.completion = nfs_write_completion,
+};
+
 /*
  * Generate multiple small requests to write out a single
  * contiguous dirty area on one page.
@@ -1060,7 +1069,7 @@ out_bad:
 		list_del(&data->list);
 		nfs_writedata_release(data);
 	}
-	nfs_async_write_error(&hdr->pages);
+	desc->pg_completion_ops->error_cleanup(&hdr->pages);
 	return -ENOMEM;
 }
 
@@ -1084,7 +1093,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
 	data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base,
 							   desc->pg_count));
 	if (!data) {
-		nfs_async_write_error(head);
+		desc->pg_completion_ops->error_cleanup(head);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -1125,7 +1134,7 @@ static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 
 	whdr = nfs_writehdr_alloc();
 	if (!whdr) {
-		nfs_async_write_error(&desc->pg_list);
+		desc->pg_completion_ops->error_cleanup(&hdr->pages);
 		return -ENOMEM;
 	}
 	hdr = &whdr->header;
@@ -1139,7 +1148,7 @@ static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 	else
 		set_bit(NFS_IOHDR_REDO, &hdr->flags);
 	if (atomic_dec_and_test(&hdr->refcnt))
-		nfs_write_completion(hdr);
+		hdr->completion_ops->completion(hdr);
 	return ret;
 }
 
@@ -1149,9 +1158,10 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = {
 };
 
 void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
-				  struct inode *inode, int ioflags)
+			       struct inode *inode, int ioflags,
+			       const struct nfs_pgio_completion_ops *compl_ops)
 {
-	nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
+	nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops,
 				NFS_SERVER(inode)->wsize, ioflags);
 }
 
@@ -1163,10 +1173,11 @@ void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
 
 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
-				  struct inode *inode, int ioflags)
+				struct inode *inode, int ioflags,
+				const struct nfs_pgio_completion_ops *compl_ops)
 {
-	if (!pnfs_pageio_init_write(pgio, inode, ioflags))
-		nfs_pageio_init_write_mds(pgio, inode, ioflags);
+	if (!pnfs_pageio_init_write(pgio, inode, ioflags, compl_ops))
+		nfs_pageio_init_write_mds(pgio, inode, ioflags, compl_ops);
 }
 
 void nfs_write_prepare(struct rpc_task *task, void *calldata)
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 5c52034..bc5b7a5 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -67,6 +67,7 @@ struct nfs_pageio_descriptor {
 	int 			pg_ioflags;
 	int			pg_error;
 	const struct rpc_call_ops *pg_rpc_callops;
+	const struct nfs_pgio_completion_ops *pg_completion_ops;
 	struct pnfs_layout_segment *pg_lseg;
 };
 
@@ -83,6 +84,7 @@ extern	void nfs_release_request(struct nfs_page *req);
 extern	void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 			     struct inode *inode,
 			     const struct nfs_pageio_ops *pg_ops,
+			     const struct nfs_pgio_completion_ops *compl_ops,
 			     size_t bsize,
 			     int how);
 extern	int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0d17db7..6fa1d22 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1207,6 +1207,7 @@ struct nfs_pgio_header {
 	loff_t			io_start;
 	const struct rpc_call_ops *mds_ops;
 	void (*release) (struct nfs_pgio_header *hdr);
+	const struct nfs_pgio_completion_ops *completion_ops;
 	spinlock_t		lock;
 	/* fields protected by lock */
 	int			pnfs_error;
@@ -1261,6 +1262,11 @@ struct nfs_commit_data {
 	int (*commit_done_cb) (struct rpc_task *task, struct nfs_commit_data *data);
 };
 
+struct nfs_pgio_completion_ops {
+	void	(*error_cleanup)(struct list_head *head);
+	void	(*completion)(struct nfs_pgio_header *hdr);
+};
+
 struct nfs_unlinkdata {
 	struct hlist_node list;
 	struct nfs_removeargs args;
-- 
cgit v0.10.2


From 9533da2979757258d3fd5429d830a297013d69ed Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:49 -0400
Subject: NFS: remove unused wb_complete field from struct nfs_page

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 4cf2a68..5d01a16 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -114,7 +114,6 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 	 * long write-back delay. This will be adjusted in
 	 * update_nfs_request below if the region is not locked. */
 	req->wb_page    = page;
-	atomic_set(&req->wb_complete, 0);
 	req->wb_index	= page->index;
 	page_cache_get(page);
 	BUG_ON(PagePrivate(page));
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1503972..705bf01 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1059,7 +1059,6 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
 		nbytes -= len;
 		offset += len;
 	} while (nbytes != 0);
-	atomic_set(&req->wb_complete, requests);
 	desc->pg_rpc_callops = &nfs_write_common_ops;
 	return ret;
 
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index bc5b7a5..0a5b63f 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -36,7 +36,6 @@ struct nfs_page {
 	struct page		*wb_page;	/* page to read in/write out */
 	struct nfs_open_context	*wb_context;	/* File state context info */
 	struct nfs_lock_context	*wb_lock_context;	/* lock context info */
-	atomic_t		wb_complete;	/* i/os we're waiting for */
 	pgoff_t			wb_index;	/* Offset >> PAGE_CACHE_SHIFT */
 	unsigned int		wb_offset,	/* Offset & ~PAGE_CACHE_MASK */
 				wb_pgbase,	/* Start of page data */
-- 
cgit v0.10.2


From 1825a0d08f22463e5a8f4b1636473efd057a3479 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 19:55:31 -0400
Subject: NFS: prepare coalesce testing for directio

The coalesce code made assumptions that will no longer be true once
non-page aligned io occurs.  This introduces no change in
current behavior, but allows for more general situations to come.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 02d8170..e40523f 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -796,6 +796,16 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 {
 	BUG_ON(pgio->pg_lseg != NULL);
 
+	if (req->wb_offset != req->wb_pgbase) {
+		/*
+		 * Handling unaligned pages is difficult, because have to
+		 * somehow split a req in two in certain cases in the
+		 * pg.test code.  Avoid this by just not using pnfs
+		 * in this case.
+		 */
+		nfs_pageio_reset_read_mds(pgio);
+		return;
+	}
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   0,
@@ -815,6 +825,8 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 
 	BUG_ON(pgio->pg_lseg != NULL);
 
+	if (req->wb_offset != req->wb_pgbase)
+		goto out_mds;
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   0,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 5d01a16..638ca7f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -280,12 +280,12 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
 		return false;
 	if (req->wb_context->state != prev->wb_context->state)
 		return false;
-	if (req->wb_index != (prev->wb_index + 1))
-		return false;
 	if (req->wb_pgbase != 0)
 		return false;
 	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
 		return false;
+	if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
+		return false;
 	return pgio->pg_ops->pg_test(pgio, prev, req);
 }
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index b3a0c01..4da05e4 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1082,6 +1082,10 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
 {
 	BUG_ON(pgio->pg_lseg != NULL);
 
+	if (req->wb_offset != req->wb_pgbase) {
+		nfs_pageio_reset_read_mds(pgio);
+		return;
+	}
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   req_offset(req),
@@ -1100,6 +1104,10 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *
 {
 	BUG_ON(pgio->pg_lseg != NULL);
 
+	if (req->wb_offset != req->wb_pgbase) {
+		nfs_pageio_reset_write_mds(pgio);
+		return;
+	}
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   req_offset(req),
-- 
cgit v0.10.2


From 584aa810b6240d88c28113a90c5029449814a3b5 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:51 -0400
Subject: NFS: rewrite directio read to use async coalesce code

This also has the advantage that it allows directio to use pnfs.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 22a40c4..4ba9a2c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -124,22 +124,6 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_
 	return -EINVAL;
 }
 
-static void nfs_direct_dirty_pages(struct page **pages, unsigned int pgbase, size_t count)
-{
-	unsigned int npages;
-	unsigned int i;
-
-	if (count == 0)
-		return;
-	pages += (pgbase >> PAGE_SHIFT);
-	npages = (count + (pgbase & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	for (i = 0; i < npages; i++) {
-		struct page *page = pages[i];
-		if (!PageCompound(page))
-			set_page_dirty(page);
-	}
-}
-
 static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 {
 	unsigned int i;
@@ -226,58 +210,92 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 	nfs_direct_req_release(dreq);
 }
 
-/*
- * We must hold a reference to all the pages in this direct read request
- * until the RPCs complete.  This could be long *after* we are woken up in
- * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
- */
-static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
+void nfs_direct_readpage_release(struct nfs_page *req)
 {
-	struct nfs_read_data *data = calldata;
-
-	nfs_readpage_result(task, data);
+	dprintk("NFS: direct read done (%s/%lld %d@%lld)\n",
+		req->wb_context->dentry->d_inode->i_sb->s_id,
+		(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+		req->wb_bytes,
+		(long long)req_offset(req));
+	nfs_release_request(req);
 }
 
-static void nfs_direct_read_release(void *calldata)
+static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 {
+	unsigned long bytes = 0;
+	struct nfs_direct_req *dreq = hdr->dreq;
 
-	struct nfs_read_data *data = calldata;
-	struct nfs_direct_req *dreq = (struct nfs_direct_req *)data->header->req;
-	int status = data->task.tk_status;
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+		goto out_put;
 
 	spin_lock(&dreq->lock);
-	if (unlikely(status < 0)) {
-		dreq->error = status;
-		spin_unlock(&dreq->lock);
+	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
+		dreq->error = hdr->error;
+	else
+		dreq->count += hdr->good_bytes;
+	spin_unlock(&dreq->lock);
+
+	if (!test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
+		while (!list_empty(&hdr->pages)) {
+			struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+			struct page *page = req->wb_page;
+
+			if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
+				if (bytes > hdr->good_bytes)
+					zero_user(page, 0, PAGE_SIZE);
+				else if (hdr->good_bytes - bytes < PAGE_SIZE)
+					zero_user_segment(page,
+						hdr->good_bytes & ~PAGE_MASK,
+						PAGE_SIZE);
+			}
+			bytes += req->wb_bytes;
+			nfs_list_remove_request(req);
+			nfs_direct_readpage_release(req);
+			if (!PageCompound(page))
+				set_page_dirty(page);
+			page_cache_release(page);
+		}
 	} else {
-		dreq->count += data->res.count;
-		spin_unlock(&dreq->lock);
-		nfs_direct_dirty_pages(data->pages.pagevec,
-				data->args.pgbase,
-				data->res.count);
+		while (!list_empty(&hdr->pages)) {
+			struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+			if (bytes < hdr->good_bytes)
+				if (!PageCompound(req->wb_page))
+					set_page_dirty(req->wb_page);
+			bytes += req->wb_bytes;
+			page_cache_release(req->wb_page);
+			nfs_list_remove_request(req);
+			nfs_direct_readpage_release(req);
+		}
 	}
-	nfs_direct_release_pages(data->pages.pagevec, data->pages.npages);
-
+out_put:
 	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
-	nfs_readdata_release(data);
+	hdr->release(hdr);
 }
 
-static const struct rpc_call_ops nfs_read_direct_ops = {
-	.rpc_call_prepare = nfs_read_prepare,
-	.rpc_call_done = nfs_direct_read_result,
-	.rpc_release = nfs_direct_read_release,
-};
-
-static void nfs_direct_readhdr_release(struct nfs_read_header *rhdr)
+static void nfs_sync_pgio_error(struct list_head *head)
 {
-	struct nfs_read_data *data = &rhdr->rpc_data;
+	struct nfs_page *req;
 
-	if (data->pages.pagevec != data->pages.page_array)
-		kfree(data->pages.pagevec);
-	nfs_readhdr_free(&rhdr->header);
+	while (!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_release_request(req);
+	}
 }
 
+static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
+{
+	get_dreq(hdr->dreq);
+}
+
+static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
+	.error_cleanup = nfs_sync_pgio_error,
+	.init_hdr = nfs_direct_pgio_init,
+	.completion = nfs_direct_read_completion,
+};
+
 /*
  * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
  * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
@@ -285,118 +303,85 @@ static void nfs_direct_readhdr_release(struct nfs_read_header *rhdr)
  * handled automatically by nfs_direct_read_result().  Otherwise, if
  * no requests have been sent, just return an error.
  */
-static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
+static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
 						const struct iovec *iov,
 						loff_t pos)
 {
+	struct nfs_direct_req *dreq = desc->pg_dreq;
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	unsigned long user_addr = (unsigned long)iov->iov_base;
 	size_t count = iov->iov_len;
 	size_t rsize = NFS_SERVER(inode)->rsize;
-	struct rpc_task *task;
-	struct rpc_message msg = {
-		.rpc_cred = ctx->cred,
-	};
-	struct rpc_task_setup task_setup_data = {
-		.rpc_client = NFS_CLIENT(inode),
-		.rpc_message = &msg,
-		.callback_ops = &nfs_read_direct_ops,
-		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC,
-	};
 	unsigned int pgbase;
 	int result;
 	ssize_t started = 0;
+	struct page **pagevec = NULL;
+	unsigned int npages;
 
 	do {
-		struct nfs_read_header *rhdr;
-		struct nfs_read_data *data;
-		struct nfs_page_array *pages;
 		size_t bytes;
+		int i;
 
 		pgbase = user_addr & ~PAGE_MASK;
-		bytes = min(rsize,count);
+		bytes = min(max(rsize, PAGE_SIZE), count);
 
 		result = -ENOMEM;
-		rhdr = nfs_readhdr_alloc();
-		if (unlikely(!rhdr))
-			break;
-		data = nfs_readdata_alloc(&rhdr->header, nfs_page_array_len(pgbase, bytes));
-		if (!data) {
-			nfs_readhdr_free(&rhdr->header);
+		npages = nfs_page_array_len(pgbase, bytes);
+		if (!pagevec)
+			pagevec = kmalloc(npages * sizeof(struct page *),
+					  GFP_KERNEL);
+		if (!pagevec)
 			break;
-		}
-		data->header = &rhdr->header;
-		atomic_inc(&data->header->refcnt);
-		pages = &data->pages;
-
 		down_read(&current->mm->mmap_sem);
 		result = get_user_pages(current, current->mm, user_addr,
-					pages->npages, 1, 0, pages->pagevec, NULL);
+					npages, 1, 0, pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
-		if (result < 0) {
-			nfs_direct_readhdr_release(rhdr);
+		if (result < 0)
 			break;
-		}
-		if ((unsigned)result < pages->npages) {
+		if ((unsigned)result < npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
-				nfs_direct_release_pages(pages->pagevec, result);
-				nfs_direct_readhdr_release(rhdr);
+				nfs_direct_release_pages(pagevec, result);
 				break;
 			}
 			bytes -= pgbase;
-			pages->npages = result;
+			npages = result;
 		}
 
-		get_dreq(dreq);
-
-		rhdr->header.req = (struct nfs_page *) dreq;
-		rhdr->header.inode = inode;
-		rhdr->header.cred = msg.rpc_cred;
-		data->args.fh = NFS_FH(inode);
-		data->args.context = get_nfs_open_context(ctx);
-		data->args.lock_context = dreq->l_ctx;
-		data->args.offset = pos;
-		data->args.pgbase = pgbase;
-		data->args.pages = pages->pagevec;
-		data->args.count = bytes;
-		data->res.fattr = &data->fattr;
-		data->res.eof = 0;
-		data->res.count = bytes;
-		nfs_fattr_init(&data->fattr);
-		msg.rpc_argp = &data->args;
-		msg.rpc_resp = &data->res;
-
-		task_setup_data.task = &data->task;
-		task_setup_data.callback_data = data;
-		NFS_PROTO(inode)->read_setup(data, &msg);
-
-		task = rpc_run_task(&task_setup_data);
-		if (IS_ERR(task))
-			break;
-
-		dprintk("NFS: %5u initiated direct read call "
-			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
-				task->tk_pid,
-				inode->i_sb->s_id,
-				(long long)NFS_FILEID(inode),
-				bytes,
-				(unsigned long long)data->args.offset);
-		rpc_put_task(task);
-
-		started += bytes;
-		user_addr += bytes;
-		pos += bytes;
-		/* FIXME: Remove this unnecessary math from final patch */
-		pgbase += bytes;
-		pgbase &= ~PAGE_MASK;
-		BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
-
-		count -= bytes;
+		for (i = 0; i < npages; i++) {
+			struct nfs_page *req;
+			unsigned int req_len = min(bytes, PAGE_SIZE - pgbase);
+			/* XXX do we need to do the eof zeroing found in async_filler? */
+			req = nfs_create_request(dreq->ctx, dreq->inode,
+						 pagevec[i],
+						 pgbase, req_len);
+			if (IS_ERR(req)) {
+				nfs_direct_release_pages(pagevec + i,
+							 npages - i);
+				result = PTR_ERR(req);
+				break;
+			}
+			req->wb_index = pos >> PAGE_SHIFT;
+			req->wb_offset = pos & ~PAGE_MASK;
+			if (!nfs_pageio_add_request(desc, req)) {
+				result = desc->pg_error;
+				nfs_release_request(req);
+				nfs_direct_release_pages(pagevec + i,
+							 npages - i);
+				break;
+			}
+			pgbase = 0;
+			bytes -= req_len;
+			started += req_len;
+			user_addr += req_len;
+			pos += req_len;
+			count -= req_len;
+		}
 	} while (count != 0);
 
+	kfree(pagevec);
+
 	if (started)
 		return started;
 	return result < 0 ? (ssize_t) result : -EFAULT;
@@ -407,15 +392,19 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 					      unsigned long nr_segs,
 					      loff_t pos)
 {
+	struct nfs_pageio_descriptor desc;
 	ssize_t result = -EINVAL;
 	size_t requested_bytes = 0;
 	unsigned long seg;
 
+	nfs_pageio_init_read(&desc, dreq->inode,
+			     &nfs_direct_read_completion_ops);
 	get_dreq(dreq);
+	desc.pg_dreq = dreq;
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
-		result = nfs_direct_read_schedule_segment(dreq, vec, pos);
+		result = nfs_direct_read_schedule_segment(&desc, vec, pos);
 		if (result < 0)
 			break;
 		requested_bytes += result;
@@ -424,6 +413,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 		pos += vec->iov_len;
 	}
 
+	nfs_pageio_complete(&desc);
+
 	/*
 	 * If no bytes were started, return the error, and let the
 	 * generic layer handle the completion.
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 3ef8fcd..cd5d4a3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -304,8 +304,9 @@ struct nfs_pgio_completion_ops;
 /* read.c */
 extern struct nfs_read_header *nfs_readhdr_alloc(void);
 extern void nfs_readhdr_free(struct nfs_pgio_header *hdr);
-extern struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr,
-						unsigned int pagecount);
+extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+			struct inode *inode,
+			const struct nfs_pgio_completion_ops *compl_ops);
 extern int nfs_initiate_read(struct rpc_clnt *clnt,
 			     struct nfs_read_data *data,
 			     const struct rpc_call_ops *call_ops);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 638ca7f..33a21ca 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -48,8 +48,11 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
 	hdr->cred = hdr->req->wb_context->cred;
 	hdr->io_start = req_offset(hdr->req);
 	hdr->good_bytes = desc->pg_count;
+	hdr->dreq = desc->pg_dreq;
 	hdr->release = release;
 	hdr->completion_ops = desc->pg_completion_ops;
+	if (hdr->completion_ops->init_hdr)
+		hdr->completion_ops->init_hdr(hdr);
 }
 
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
@@ -116,9 +119,6 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 	req->wb_page    = page;
 	req->wb_index	= page->index;
 	page_cache_get(page);
-	BUG_ON(PagePrivate(page));
-	BUG_ON(!PageLocked(page));
-	BUG_ON(page->mapping->host != inode);
 	req->wb_offset  = offset;
 	req->wb_pgbase	= offset;
 	req->wb_bytes   = count;
@@ -257,6 +257,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_ioflags = io_flags;
 	desc->pg_error = 0;
 	desc->pg_lseg = NULL;
+	desc->pg_dreq = NULL;
 }
 
 /**
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 5e78af1..35e2dce 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -51,8 +51,8 @@ struct nfs_read_header *nfs_readhdr_alloc()
 	return rhdr;
 }
 
-struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr,
-					 unsigned int pagecount)
+static struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr,
+						unsigned int pagecount)
 {
 	struct nfs_read_data *data, *prealloc;
 
@@ -123,9 +123,9 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
 
-static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
-				struct inode *inode,
-				const struct nfs_pgio_completion_ops *compl_ops)
+void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+			  struct inode *inode,
+			  const struct nfs_pgio_completion_ops *compl_ops)
 {
 	if (!pnfs_pageio_init_read(pgio, inode, compl_ops))
 		nfs_pageio_init_read_mds(pgio, inode, compl_ops);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 0a5b63f..f9ee9eb 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -68,6 +68,7 @@ struct nfs_pageio_descriptor {
 	const struct rpc_call_ops *pg_rpc_callops;
 	const struct nfs_pgio_completion_ops *pg_completion_ops;
 	struct pnfs_layout_segment *pg_lseg;
+	struct nfs_direct_req	*pg_dreq;
 };
 
 #define NFS_WBACK_BUSY(req)	(test_bit(PG_BUSY,&(req)->wb_flags))
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6fa1d22..38687b8 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1208,6 +1208,7 @@ struct nfs_pgio_header {
 	const struct rpc_call_ops *mds_ops;
 	void (*release) (struct nfs_pgio_header *hdr);
 	const struct nfs_pgio_completion_ops *completion_ops;
+	struct nfs_direct_req	*dreq;
 	spinlock_t		lock;
 	/* fields protected by lock */
 	int			pnfs_error;
@@ -1221,8 +1222,6 @@ struct nfs_read_header {
 	struct nfs_read_data	rpc_data;
 };
 
-struct nfs_direct_req;
-
 struct nfs_write_data {
 	struct nfs_pgio_header	*header;
 	struct list_head	list;
@@ -1264,6 +1263,7 @@ struct nfs_commit_data {
 
 struct nfs_pgio_completion_ops {
 	void	(*error_cleanup)(struct list_head *head);
+	void	(*init_hdr)(struct nfs_pgio_header *hdr);
 	void	(*completion)(struct nfs_pgio_header *hdr);
 };
 
-- 
cgit v0.10.2


From 84c53ab5c093058c756dcef1879d38be6de90a3c Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:52 -0400
Subject: NFS: create nfs_generic_commit_list

Simple refactoring.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 705bf01..2500f1c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1523,6 +1523,17 @@ static const struct rpc_call_ops nfs_commit_ops = {
 	.rpc_release = nfs_commit_release,
 };
 
+static int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
+				   int how)
+{
+	int status;
+
+	status = pnfs_commit_list(inode, head, how);
+	if (status == PNFS_NOT_ATTEMPTED)
+		status = nfs_commit_list(inode, head, how);
+	return status;
+}
+
 int nfs_commit_inode(struct inode *inode, int how)
 {
 	LIST_HEAD(head);
@@ -1536,9 +1547,7 @@ int nfs_commit_inode(struct inode *inode, int how)
 	if (res) {
 		int error;
 
-		error = pnfs_commit_list(inode, &head, how);
-		if (error == PNFS_NOT_ATTEMPTED)
-			error = nfs_commit_list(inode, &head, how);
+		error = nfs_generic_commit_list(inode, &head, how);
 		if (error < 0)
 			return error;
 		if (!may_wait)
-- 
cgit v0.10.2


From ea2cf2282b4278461266013e9c002ee1c66700ff Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:53 -0400
Subject: NFS: create struct nfs_commit_info

It is COMMIT that is handled the most differently between
the paged and direct paths.  Create a structure that encapsulates
everything either path needs to know about the commit state.

We could use void to hide some of the layout driver stuff, but
Trond suggests pulling it out to ensure type checking, given the
huge changes being made, and the fact that it doesn't interfere
with other drivers.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e8bbfa5..59a12c6a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1547,7 +1547,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
 	nfsi->delegation_state = 0;
 	init_rwsem(&nfsi->rwsem);
 	nfsi->layout = NULL;
-	atomic_set(&nfsi->commits_outstanding, 0);
+	atomic_set(&nfsi->commit_info.rpcs_out, 0);
 #endif
 }
 
@@ -1559,9 +1559,9 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&nfsi->open_files);
 	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
-	INIT_LIST_HEAD(&nfsi->commit_list);
+	INIT_LIST_HEAD(&nfsi->commit_info.list);
 	nfsi->npages = 0;
-	nfsi->ncommit = 0;
+	nfsi->commit_info.ncommit = 0;
 	atomic_set(&nfsi->silly_count, 1);
 	INIT_HLIST_HEAD(&nfsi->silly_list);
 	init_waitqueue_head(&nfsi->waitqueue);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index cd5d4a3..145e9e7 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -346,12 +346,18 @@ extern void nfs_init_commit(struct nfs_commit_data *data,
 			    struct list_head *head,
 			    struct pnfs_layout_segment *lseg);
 void nfs_retry_commit(struct list_head *page_list,
-		      struct pnfs_layout_segment *lseg);
+		      struct pnfs_layout_segment *lseg,
+		      struct nfs_commit_info *cinfo);
 void nfs_commit_clear_lock(struct nfs_inode *nfsi);
 void nfs_commitdata_release(struct nfs_commit_data *data);
 void nfs_commit_release_pages(struct nfs_commit_data *data);
-void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head);
-void nfs_request_remove_commit_list(struct nfs_page *req);
+void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
+				 struct nfs_commit_info *cinfo);
+void nfs_request_remove_commit_list(struct nfs_page *req,
+				    struct nfs_commit_info *cinfo);
+void nfs_init_cinfo(struct nfs_commit_info *cinfo,
+		    struct inode *inode,
+		    struct nfs_direct_req *dreq);
 
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index e40523f..fe2cb55 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -347,9 +347,11 @@ static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
 static void filelayout_commit_release(void *calldata)
 {
 	struct nfs_commit_data *data = calldata;
+	struct nfs_commit_info cinfo;
 
 	nfs_commit_release_pages(data);
-	if (atomic_dec_and_test(&NFS_I(data->inode)->commits_outstanding))
+	nfs_init_cinfo(&cinfo, data->inode, data->dreq);
+	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
 		nfs_commit_clear_lock(NFS_I(data->inode));
 	put_lseg(data->lseg);
 	nfs_commitdata_release(data);
@@ -695,17 +697,16 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
 
 static int
 filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
+			     struct nfs_commit_info *cinfo,
 			     gfp_t gfp_flags)
 {
 	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
-	struct nfs4_filelayout *flo = FILELAYOUT_FROM_HDR(lseg->pls_layout);
-
-	struct nfs4_fl_commit_bucket *buckets;
+	struct pnfs_commit_bucket *buckets;
 	int size;
 
 	if (fl->commit_through_mds)
 		return 0;
-	if (flo->commit_info.nbuckets != 0) {
+	if (cinfo->ds->nbuckets != 0) {
 		/* This assumes there is only one IOMODE_RW lseg.  What
 		 * we really want to do is have a layout_hdr level
 		 * dictionary of <multipath_list4, fh> keys, each
@@ -718,25 +719,25 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 	size = (fl->stripe_type == STRIPE_SPARSE) ?
 		fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
 
-	buckets = kcalloc(size, sizeof(struct nfs4_fl_commit_bucket),
+	buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
 			  gfp_flags);
 	if (!buckets)
 		return -ENOMEM;
 	else {
 		int i;
 
-		spin_lock(&lseg->pls_layout->plh_inode->i_lock);
-		if (flo->commit_info.nbuckets != 0)
+		spin_lock(cinfo->lock);
+		if (cinfo->ds->nbuckets != 0)
 			kfree(buckets);
 		else {
-			flo->commit_info.buckets = buckets;
-			flo->commit_info.nbuckets = size;
+			cinfo->ds->buckets = buckets;
+			cinfo->ds->nbuckets = size;
 			for (i = 0; i < size; i++) {
 				INIT_LIST_HEAD(&buckets[i].written);
 				INIT_LIST_HEAD(&buckets[i].committing);
 			}
 		}
-		spin_unlock(&lseg->pls_layout->plh_inode->i_lock);
+		spin_unlock(cinfo->lock);
 		return 0;
 	}
 }
@@ -821,6 +822,7 @@ static void
 filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 			 struct nfs_page *req)
 {
+	struct nfs_commit_info cinfo;
 	int status;
 
 	BUG_ON(pgio->pg_lseg != NULL);
@@ -836,7 +838,8 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 	/* If no lseg, fall back to write through mds */
 	if (pgio->pg_lseg == NULL)
 		goto out_mds;
-	status = filelayout_alloc_commit_info(pgio->pg_lseg, GFP_NOFS);
+	nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
+	status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
 	if (status < 0) {
 		put_lseg(pgio->pg_lseg);
 		pgio->pg_lseg = NULL;
@@ -871,40 +874,42 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
  * If this will make the bucket empty, it will need to put the lseg reference.
  */
 static void
-filelayout_clear_request_commit(struct nfs_page *req)
+filelayout_clear_request_commit(struct nfs_page *req,
+				struct nfs_commit_info *cinfo)
 {
 	struct pnfs_layout_segment *freeme = NULL;
-	struct inode *inode = req->wb_context->dentry->d_inode;
 
-	spin_lock(&inode->i_lock);
+	spin_lock(cinfo->lock);
 	if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
 		goto out;
+	cinfo->ds->nwritten--;
 	if (list_is_singular(&req->wb_list)) {
-		struct nfs4_fl_commit_bucket *bucket;
+		struct pnfs_commit_bucket *bucket;
 
 		bucket = list_first_entry(&req->wb_list,
-					  struct nfs4_fl_commit_bucket,
+					  struct pnfs_commit_bucket,
 					  written);
 		freeme = bucket->wlseg;
 		bucket->wlseg = NULL;
 	}
 out:
-	nfs_request_remove_commit_list(req);
-	spin_unlock(&inode->i_lock);
+	nfs_request_remove_commit_list(req, cinfo);
+	spin_unlock(cinfo->lock);
 	put_lseg(freeme);
 }
 
 static struct list_head *
 filelayout_choose_commit_list(struct nfs_page *req,
-			      struct pnfs_layout_segment *lseg)
+			      struct pnfs_layout_segment *lseg,
+			      struct nfs_commit_info *cinfo)
 {
 	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
 	u32 i, j;
 	struct list_head *list;
-	struct nfs4_fl_commit_bucket *buckets;
+	struct pnfs_commit_bucket *buckets;
 
 	if (fl->commit_through_mds)
-		return &NFS_I(req->wb_context->dentry->d_inode)->commit_list;
+		return &cinfo->mds->list;
 
 	/* Note that we are calling nfs4_fl_calc_j_index on each page
 	 * that ends up being committed to a data server.  An attractive
@@ -914,7 +919,7 @@ filelayout_choose_commit_list(struct nfs_page *req,
 	 */
 	j = nfs4_fl_calc_j_index(lseg, req_offset(req));
 	i = select_bucket_index(fl, j);
-	buckets = FILELAYOUT_FROM_HDR(lseg->pls_layout)->commit_info.buckets;
+	buckets = cinfo->ds->buckets;
 	list = &buckets[i].written;
 	if (list_empty(list)) {
 		/* Non-empty buckets hold a reference on the lseg.  That ref
@@ -926,17 +931,19 @@ filelayout_choose_commit_list(struct nfs_page *req,
 		buckets[i].wlseg = get_lseg(lseg);
 	}
 	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+	cinfo->ds->nwritten++;
 	return list;
 }
 
 static void
 filelayout_mark_request_commit(struct nfs_page *req,
-		struct pnfs_layout_segment *lseg)
+			       struct pnfs_layout_segment *lseg,
+			       struct nfs_commit_info *cinfo)
 {
 	struct list_head *list;
 
-	list = filelayout_choose_commit_list(req, lseg);
-	nfs_request_add_commit_list(req, list);
+	list = filelayout_choose_commit_list(req, lseg, cinfo);
+	nfs_request_add_commit_list(req, list, cinfo);
 }
 
 static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
@@ -993,8 +1000,9 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
 }
 
 static int
-filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max,
-		spinlock_t *lock)
+filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
+			       struct nfs_commit_info *cinfo,
+			       int max)
 {
 	struct list_head *src = &bucket->written;
 	struct list_head *dst = &bucket->committing;
@@ -1004,9 +1012,9 @@ filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max,
 	list_for_each_entry_safe(req, tmp, src, wb_list) {
 		if (!nfs_lock_request(req))
 			continue;
-		if (cond_resched_lock(lock))
+		if (cond_resched_lock(cinfo->lock))
 			list_safe_reset_next(req, tmp, wb_list);
-		nfs_request_remove_commit_list(req);
+		nfs_request_remove_commit_list(req, cinfo);
 		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 		nfs_list_add_request(req, dst);
 		ret++;
@@ -1014,6 +1022,8 @@ filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max,
 			break;
 	}
 	if (ret) {
+		cinfo->ds->nwritten -= ret;
+		cinfo->ds->ncommitting += ret;
 		bucket->clseg = bucket->wlseg;
 		if (list_empty(src))
 			bucket->wlseg = NULL;
@@ -1024,37 +1034,32 @@ filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max,
 }
 
 /* Move reqs from written to committing lists, returning count of number moved.
- * Note called with i_lock held.
+ * Note called with cinfo->lock held.
  */
-static int filelayout_scan_commit_lists(struct inode *inode, int max,
-		spinlock_t *lock)
+static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo,
+					int max)
 {
-	struct nfs4_fl_commit_info *fl_cinfo;
 	int i, rv = 0, cnt;
 
-	fl_cinfo = &FILELAYOUT_FROM_HDR(NFS_I(inode)->layout)->commit_info;
-	if (fl_cinfo->nbuckets == 0)
-		goto out_done;
-	for (i = 0; i < fl_cinfo->nbuckets && max != 0; i++) {
-		cnt = filelayout_scan_ds_commit_list(&fl_cinfo->buckets[i],
-				max, lock);
+	for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
+		cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i],
+						     cinfo, max);
 		max -= cnt;
 		rv += cnt;
 	}
-out_done:
 	return rv;
 }
 
 static unsigned int
-alloc_ds_commits(struct inode *inode, struct list_head *list)
+alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
 {
-	struct nfs4_fl_commit_info *fl_cinfo;
-	struct nfs4_fl_commit_bucket *bucket;
+	struct pnfs_ds_commit_info *fl_cinfo;
+	struct pnfs_commit_bucket *bucket;
 	struct nfs_commit_data *data;
 	int i, j;
 	unsigned int nreq = 0;
 
-	fl_cinfo = &FILELAYOUT_FROM_HDR(NFS_I(inode)->layout)->commit_info;
+	fl_cinfo = cinfo->ds;
 	bucket = fl_cinfo->buckets;
 	for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
 		if (list_empty(&bucket->committing))
@@ -1073,7 +1078,7 @@ alloc_ds_commits(struct inode *inode, struct list_head *list)
 	for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) {
 		if (list_empty(&bucket->committing))
 			continue;
-		nfs_retry_commit(&bucket->committing, bucket->clseg);
+		nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
 		put_lseg(bucket->clseg);
 		bucket->clseg = NULL;
 	}
@@ -1084,7 +1089,7 @@ alloc_ds_commits(struct inode *inode, struct list_head *list)
 /* This follows nfs_commit_list pretty closely */
 static int
 filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
-			   int how)
+			   int how, struct nfs_commit_info *cinfo)
 {
 	struct nfs_commit_data *data, *tmp;
 	LIST_HEAD(list);
@@ -1097,17 +1102,17 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 			list_add(&data->pages, &list);
 			nreq++;
 		} else
-			nfs_retry_commit(mds_pages, NULL);
+			nfs_retry_commit(mds_pages, NULL, cinfo);
 	}
 
-	nreq += alloc_ds_commits(inode, &list);
+	nreq += alloc_ds_commits(cinfo, &list);
 
 	if (nreq == 0) {
 		nfs_commit_clear_lock(NFS_I(inode));
 		goto out;
 	}
 
-	atomic_add(nreq, &NFS_I(inode)->commits_outstanding);
+	atomic_add(nreq, &cinfo->mds->rpcs_out);
 
 	list_for_each_entry_safe(data, tmp, &list, pages) {
 		list_del_init(&data->pages);
@@ -1116,14 +1121,15 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 			nfs_initiate_commit(NFS_CLIENT(inode), data,
 					    data->mds_ops, how);
 		} else {
-			struct nfs4_fl_commit_info *fl_cinfo;
+			struct pnfs_commit_bucket *buckets;
 
-			fl_cinfo = &FILELAYOUT_FROM_HDR(data->lseg->pls_layout)->commit_info;
-			nfs_init_commit(data, &fl_cinfo->buckets[data->ds_commit_index].committing, data->lseg);
+			buckets = cinfo->ds->buckets;
+			nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg);
 			filelayout_initiate_commit(data, how);
 		}
 	}
 out:
+	cinfo->ds->ncommitting = 0;
 	return PNFS_ATTEMPTED;
 }
 
@@ -1148,6 +1154,12 @@ filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
 	kfree(FILELAYOUT_FROM_HDR(lo));
 }
 
+static struct pnfs_ds_commit_info *
+filelayout_get_ds_info(struct inode *inode)
+{
+	return &FILELAYOUT_FROM_HDR(NFS_I(inode)->layout)->commit_info;
+}
+
 static struct pnfs_layoutdriver_type filelayout_type = {
 	.id			= LAYOUT_NFSV4_1_FILES,
 	.name			= "LAYOUT_NFSV4_1_FILES",
@@ -1158,6 +1170,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.free_lseg		= filelayout_free_lseg,
 	.pg_read_ops		= &filelayout_pg_read_ops,
 	.pg_write_ops		= &filelayout_pg_write_ops,
+	.get_ds_info		= &filelayout_get_ds_info,
 	.mark_request_commit	= filelayout_mark_request_commit,
 	.clear_request_commit	= filelayout_clear_request_commit,
 	.scan_commit_lists	= filelayout_scan_commit_lists,
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 333a3ac..96b89bb 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -74,18 +74,6 @@ struct nfs4_file_layout_dsaddr {
 	struct nfs4_pnfs_ds		*ds_list[1];
 };
 
-struct nfs4_fl_commit_bucket {
-	struct list_head written;
-	struct list_head committing;
-	struct pnfs_layout_segment *wlseg;
-	struct pnfs_layout_segment *clseg;
-};
-
-struct nfs4_fl_commit_info {
-	int nbuckets;
-	struct nfs4_fl_commit_bucket *buckets;
-};
-
 struct nfs4_filelayout_segment {
 	struct pnfs_layout_segment generic_hdr;
 	u32 stripe_type;
@@ -100,7 +88,7 @@ struct nfs4_filelayout_segment {
 
 struct nfs4_filelayout {
 	struct pnfs_layout_hdr generic_hdr;
-	struct nfs4_fl_commit_info commit_info;
+	struct pnfs_ds_commit_info commit_info;
 };
 
 static inline struct nfs4_filelayout *
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 734e4ef..4cd8760 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,11 +94,18 @@ struct pnfs_layoutdriver_type {
 	const struct nfs_pageio_ops *pg_read_ops;
 	const struct nfs_pageio_ops *pg_write_ops;
 
+	struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
 	void (*mark_request_commit) (struct nfs_page *req,
-					struct pnfs_layout_segment *lseg);
-	void (*clear_request_commit) (struct nfs_page *req);
-	int (*scan_commit_lists) (struct inode *inode, int max, spinlock_t *lock);
-	int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how);
+				     struct pnfs_layout_segment *lseg,
+				     struct nfs_commit_info *cinfo);
+	void (*clear_request_commit) (struct nfs_page *req,
+				      struct nfs_commit_info *cinfo);
+	int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
+				  int max);
+	int (*commit_pagelist)(struct inode *inode,
+			       struct list_head *mds_pages,
+			       int how,
+			       struct nfs_commit_info *cinfo);
 
 	/*
 	 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
@@ -263,49 +270,57 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss)
 }
 
 static inline int
-pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
+pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
+		 struct nfs_commit_info *cinfo)
 {
-	if (!test_and_clear_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags))
+	if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0)
 		return PNFS_NOT_ATTEMPTED;
-	return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how);
+	return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo);
+}
+
+static inline struct pnfs_ds_commit_info *
+pnfs_get_ds_info(struct inode *inode)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (ld == NULL || ld->get_ds_info == NULL)
+		return NULL;
+	return ld->get_ds_info(inode);
 }
 
 static inline bool
-pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			 struct nfs_commit_info *cinfo)
 {
 	struct inode *inode = req->wb_context->dentry->d_inode;
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 
 	if (lseg == NULL || ld->mark_request_commit == NULL)
 		return false;
-	ld->mark_request_commit(req, lseg);
+	ld->mark_request_commit(req, lseg, cinfo);
 	return true;
 }
 
 static inline bool
-pnfs_clear_request_commit(struct nfs_page *req)
+pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
 	struct inode *inode = req->wb_context->dentry->d_inode;
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 
 	if (ld == NULL || ld->clear_request_commit == NULL)
 		return false;
-	ld->clear_request_commit(req);
+	ld->clear_request_commit(req, cinfo);
 	return true;
 }
 
 static inline int
-pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock)
+pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
+		       int max)
 {
-	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
-	int ret;
-
-	if (ld == NULL || ld->scan_commit_lists == NULL)
+	if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
 		return 0;
-	ret = ld->scan_commit_lists(inode, max, lock);
-	if (ret != 0)
-		set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags);
-	return ret;
+	else
+		return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
 }
 
 /* Should the pNFS client commit and return the layout upon a setattr */
@@ -409,25 +424,34 @@ static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, st
 }
 
 static inline int
-pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
+pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
+		 struct nfs_commit_info *cinfo)
 {
 	return PNFS_NOT_ATTEMPTED;
 }
 
+static inline struct pnfs_ds_commit_info *
+pnfs_get_ds_info(struct inode *inode)
+{
+	return NULL;
+}
+
 static inline bool
-pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			 struct nfs_commit_info *cinfo)
 {
 	return false;
 }
 
 static inline bool
-pnfs_clear_request_commit(struct nfs_page *req)
+pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
 	return false;
 }
 
 static inline int
-pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock)
+pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
+		       int max)
 {
 	return 0;
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2500f1c..18bf700 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -452,65 +452,79 @@ nfs_mark_request_dirty(struct nfs_page *req)
 /**
  * nfs_request_add_commit_list - add request to a commit list
  * @req: pointer to a struct nfs_page
- * @head: commit list head
+ * @dst: commit list head
+ * @cinfo: holds list lock and accounting info
  *
- * This sets the PG_CLEAN bit, updates the inode global count of
+ * This sets the PG_CLEAN bit, updates the cinfo count of
  * number of outstanding requests requiring a commit as well as
  * the MM page stats.
  *
- * The caller must _not_ hold the inode->i_lock, but must be
+ * The caller must _not_ hold the cinfo->lock, but must be
  * holding the nfs_page lock.
  */
 void
-nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head)
+nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
+			    struct nfs_commit_info *cinfo)
 {
-	struct inode *inode = req->wb_context->dentry->d_inode;
-
 	set_bit(PG_CLEAN, &(req)->wb_flags);
-	spin_lock(&inode->i_lock);
-	nfs_list_add_request(req, head);
-	NFS_I(inode)->ncommit++;
-	spin_unlock(&inode->i_lock);
+	spin_lock(cinfo->lock);
+	nfs_list_add_request(req, dst);
+	cinfo->mds->ncommit++;
+	spin_unlock(cinfo->lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	__mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC);
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
 
 /**
  * nfs_request_remove_commit_list - Remove request from a commit list
  * @req: pointer to a nfs_page
+ * @cinfo: holds list lock and accounting info
  *
- * This clears the PG_CLEAN bit, and updates the inode global count of
+ * This clears the PG_CLEAN bit, and updates the cinfo's count of
  * number of outstanding requests requiring a commit
  * It does not update the MM page stats.
  *
- * The caller _must_ hold the inode->i_lock and the nfs_page lock.
+ * The caller _must_ hold the cinfo->lock and the nfs_page lock.
  */
 void
-nfs_request_remove_commit_list(struct nfs_page *req)
+nfs_request_remove_commit_list(struct nfs_page *req,
+			       struct nfs_commit_info *cinfo)
 {
-	struct inode *inode = req->wb_context->dentry->d_inode;
-
 	if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags))
 		return;
 	nfs_list_remove_request(req);
-	NFS_I(inode)->ncommit--;
+	cinfo->mds->ncommit--;
 }
 EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
 
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+				      struct inode *inode)
+{
+	cinfo->lock = &inode->i_lock;
+	cinfo->mds = &NFS_I(inode)->commit_info;
+	cinfo->ds = pnfs_get_ds_info(inode);
+}
+
+void nfs_init_cinfo(struct nfs_commit_info *cinfo,
+		    struct inode *inode,
+		    struct nfs_direct_req *dreq)
+{
+	nfs_init_cinfo_from_inode(cinfo, inode);
+}
+EXPORT_SYMBOL_GPL(nfs_init_cinfo);
 
 /*
  * Add a request to the inode's commit list.
  */
 static void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			struct nfs_commit_info *cinfo)
 {
-	struct inode *inode = req->wb_context->dentry->d_inode;
-
-	if (pnfs_mark_request_commit(req, lseg))
+	if (pnfs_mark_request_commit(req, lseg, cinfo))
 		return;
-	nfs_request_add_commit_list(req, &NFS_I(inode)->commit_list);
+	nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
 }
 
 static void
@@ -525,11 +539,13 @@ nfs_clear_request_commit(struct nfs_page *req)
 {
 	if (test_bit(PG_CLEAN, &req->wb_flags)) {
 		struct inode *inode = req->wb_context->dentry->d_inode;
+		struct nfs_commit_info cinfo;
 
-		if (!pnfs_clear_request_commit(req)) {
-			spin_lock(&inode->i_lock);
-			nfs_request_remove_commit_list(req);
-			spin_unlock(&inode->i_lock);
+		nfs_init_cinfo_from_inode(&cinfo, inode);
+		if (!pnfs_clear_request_commit(req, &cinfo)) {
+			spin_lock(cinfo.lock);
+			nfs_request_remove_commit_list(req, &cinfo);
+			spin_unlock(cinfo.lock);
 		}
 		nfs_clear_page_commit(req->wb_page);
 	}
@@ -545,7 +561,8 @@ int nfs_write_need_commit(struct nfs_write_data *data)
 
 #else
 static void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			struct nfs_commit_info *cinfo)
 {
 }
 
@@ -564,10 +581,12 @@ int nfs_write_need_commit(struct nfs_write_data *data)
 
 static void nfs_write_completion(struct nfs_pgio_header *hdr)
 {
+	struct nfs_commit_info cinfo;
 	unsigned long bytes = 0;
 
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
 		goto out;
+	nfs_init_cinfo_from_inode(&cinfo, hdr->inode);
 	while (!list_empty(&hdr->pages)) {
 		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 		struct page *page = req->wb_page;
@@ -585,7 +604,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 			goto next;
 		}
 		if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
-			nfs_mark_request_commit(req, hdr->lseg);
+			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
 			goto next;
 		}
 remove_req:
@@ -599,16 +618,16 @@ out:
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-static int
-nfs_need_commit(struct nfs_inode *nfsi)
+static unsigned long
+nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 {
-	return nfsi->ncommit > 0;
+	return cinfo->mds->ncommit;
 }
 
-/* i_lock held by caller */
+/* cinfo->lock held by caller */
 static int
-nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max,
-		spinlock_t *lock)
+nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
+		     struct nfs_commit_info *cinfo, int max)
 {
 	struct nfs_page *req, *tmp;
 	int ret = 0;
@@ -616,9 +635,9 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max,
 	list_for_each_entry_safe(req, tmp, src, wb_list) {
 		if (!nfs_lock_request(req))
 			continue;
-		if (cond_resched_lock(lock))
+		if (cond_resched_lock(cinfo->lock))
 			list_safe_reset_next(req, tmp, wb_list);
-		nfs_request_remove_commit_list(req);
+		nfs_request_remove_commit_list(req, cinfo);
 		nfs_list_add_request(req, dst);
 		ret++;
 		if (ret == max)
@@ -630,37 +649,38 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max,
 /*
  * nfs_scan_commit - Scan an inode for commit requests
  * @inode: NFS inode to scan
- * @dst: destination list
+ * @dst: mds destination list
+ * @cinfo: mds and ds lists of reqs ready to commit
  *
  * Moves requests from the inode's 'commit' request list.
  * The requests are *not* checked to ensure that they form a contiguous set.
  */
 static int
-nfs_scan_commit(struct inode *inode, struct list_head *dst)
+nfs_scan_commit(struct inode *inode, struct list_head *dst,
+		struct nfs_commit_info *cinfo)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
 	int ret = 0;
 
-	spin_lock(&inode->i_lock);
-	if (nfsi->ncommit > 0) {
+	spin_lock(cinfo->lock);
+	if (cinfo->mds->ncommit > 0) {
 		const int max = INT_MAX;
 
-		ret = nfs_scan_commit_list(&nfsi->commit_list, dst, max,
-				&inode->i_lock);
-		ret += pnfs_scan_commit_lists(inode, max - ret,
-				&inode->i_lock);
+		ret = nfs_scan_commit_list(&cinfo->mds->list, dst,
+					   cinfo, max);
+		ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
 	}
-	spin_unlock(&inode->i_lock);
+	spin_unlock(cinfo->lock);
 	return ret;
 }
 
 #else
-static inline int nfs_need_commit(struct nfs_inode *nfsi)
+static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 {
 	return 0;
 }
 
-static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst)
+static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst,
+				  struct nfs_commit_info *cinfo)
 {
 	return 0;
 }
@@ -929,7 +949,7 @@ EXPORT_SYMBOL_GPL(nfs_initiate_write);
  */
 static void nfs_write_rpcsetup(struct nfs_write_data *data,
 		unsigned int count, unsigned int offset,
-		int how)
+		int how, struct nfs_commit_info *cinfo)
 {
 	struct nfs_page *req = data->header->req;
 
@@ -950,7 +970,7 @@ static void nfs_write_rpcsetup(struct nfs_write_data *data,
 	case 0:
 		break;
 	case FLUSH_COND_STABLE:
-		if (nfs_need_commit(NFS_I(data->header->inode)))
+		if (nfs_reqs_to_commit(cinfo))
 			break;
 	default:
 		data->args.stable = NFS_FILE_SYNC;
@@ -1034,12 +1054,14 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
 	unsigned int offset;
 	int requests = 0;
 	int ret = 0;
+	struct nfs_commit_info cinfo;
 
+	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
 	nfs_list_remove_request(req);
 	nfs_list_add_request(req, &hdr->pages);
 
 	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
-	    (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit ||
+	    (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) ||
 	     desc->pg_count > wsize))
 		desc->pg_ioflags &= ~FLUSH_COND_STABLE;
 
@@ -1053,7 +1075,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
 		if (!data)
 			goto out_bad;
 		data->pages.pagevec[0] = page;
-		nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags);
+		nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo);
 		list_add(&data->list, &hdr->rpc_list);
 		requests++;
 		nbytes -= len;
@@ -1088,6 +1110,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
 	struct nfs_write_data	*data;
 	struct list_head *head = &desc->pg_list;
 	int ret = 0;
+	struct nfs_commit_info cinfo;
 
 	data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base,
 							   desc->pg_count));
@@ -1097,6 +1120,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
 		goto out;
 	}
 
+	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
 	pages = data->pages.pagevec;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
@@ -1106,11 +1130,11 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
 	}
 
 	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
-	    (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
+	    (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
 		desc->pg_ioflags &= ~FLUSH_COND_STABLE;
 
 	/* Set up the argument struct */
-	nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags);
+	nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
 	list_add(&data->list, &hdr->rpc_list);
 	desc->pg_rpc_callops = &nfs_write_common_ops;
 out:
@@ -1417,14 +1441,15 @@ void nfs_init_commit(struct nfs_commit_data *data,
 EXPORT_SYMBOL_GPL(nfs_init_commit);
 
 void nfs_retry_commit(struct list_head *page_list,
-		      struct pnfs_layout_segment *lseg)
+		      struct pnfs_layout_segment *lseg,
+		      struct nfs_commit_info *cinfo)
 {
 	struct nfs_page *req;
 
 	while (!list_empty(page_list)) {
 		req = nfs_list_entry(page_list->next);
 		nfs_list_remove_request(req);
-		nfs_mark_request_commit(req, lseg);
+		nfs_mark_request_commit(req, lseg, cinfo);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
 			     BDI_RECLAIMABLE);
@@ -1437,7 +1462,8 @@ EXPORT_SYMBOL_GPL(nfs_retry_commit);
  * Commit dirty pages
  */
 static int
-nfs_commit_list(struct inode *inode, struct list_head *head, int how)
+nfs_commit_list(struct inode *inode, struct list_head *head, int how,
+		struct nfs_commit_info *cinfo)
 {
 	struct nfs_commit_data	*data;
 
@@ -1450,7 +1476,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 	nfs_init_commit(data, head, NULL);
 	return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops, how);
  out_bad:
-	nfs_retry_commit(head, NULL);
+	nfs_retry_commit(head, NULL, cinfo);
 	nfs_commit_clear_lock(NFS_I(inode));
 	return -ENOMEM;
 }
@@ -1524,30 +1550,32 @@ static const struct rpc_call_ops nfs_commit_ops = {
 };
 
 static int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
-				   int how)
+				   int how, struct nfs_commit_info *cinfo)
 {
 	int status;
 
-	status = pnfs_commit_list(inode, head, how);
+	status = pnfs_commit_list(inode, head, how, cinfo);
 	if (status == PNFS_NOT_ATTEMPTED)
-		status = nfs_commit_list(inode, head, how);
+		status = nfs_commit_list(inode, head, how, cinfo);
 	return status;
 }
 
 int nfs_commit_inode(struct inode *inode, int how)
 {
 	LIST_HEAD(head);
+	struct nfs_commit_info cinfo;
 	int may_wait = how & FLUSH_SYNC;
 	int res;
 
 	res = nfs_commit_set_lock(NFS_I(inode), may_wait);
 	if (res <= 0)
 		goto out_mark_dirty;
-	res = nfs_scan_commit(inode, &head);
+	nfs_init_cinfo_from_inode(&cinfo, inode);
+	res = nfs_scan_commit(inode, &head, &cinfo);
 	if (res) {
 		int error;
 
-		error = nfs_generic_commit_list(inode, &head, how);
+		error = nfs_generic_commit_list(inode, &head, how, &cinfo);
 		if (error < 0)
 			return error;
 		if (!may_wait)
@@ -1578,14 +1606,14 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
 	int ret = 0;
 
 	/* no commits means nothing needs to be done */
-	if (!nfsi->ncommit)
+	if (!nfsi->commit_info.ncommit)
 		return ret;
 
 	if (wbc->sync_mode == WB_SYNC_NONE) {
 		/* Don't commit yet if this is a non-blocking flush and there
 		 * are a lot of outstanding writes for this mapping.
 		 */
-		if (nfsi->ncommit <= (nfsi->npages >> 1))
+		if (nfsi->commit_info.ncommit <= (nfsi->npages >> 1))
 			goto out_mark_dirty;
 
 		/* don't wait for the COMMIT response */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 8d3a2b8..8a88c16 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -179,8 +179,7 @@ struct nfs_inode {
 	__be32			cookieverf[2];
 
 	unsigned long		npages;
-	unsigned long		ncommit;
-	struct list_head	commit_list;
+	struct nfs_mds_commit_info commit_info;
 
 	/* Open contexts for shared mmap writes */
 	struct list_head	open_files;
@@ -201,7 +200,6 @@ struct nfs_inode {
 
 	/* pNFS layout information */
 	struct pnfs_layout_hdr *layout;
-	atomic_t		commits_outstanding;
 #endif /* CONFIG_NFS_V4*/
 #ifdef CONFIG_NFS_FSCACHE
 	struct fscache_cookie	*fscache;
@@ -230,7 +228,6 @@ struct nfs_inode {
 #define NFS_INO_FSCACHE		(5)		/* inode can be cached by FS-Cache */
 #define NFS_INO_FSCACHE_LOCK	(6)		/* FS-Cache cookie management lock */
 #define NFS_INO_COMMIT		(7)		/* inode is committing unstable writes */
-#define NFS_INO_PNFS_COMMIT	(8)		/* use pnfs code for commit */
 #define NFS_INO_LAYOUTCOMMIT	(9)		/* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)		/* layoutcommit inflight */
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 38687b8..224e1e8 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1079,6 +1079,21 @@ struct nfstime4 {
 };
 
 #ifdef CONFIG_NFS_V4_1
+
+struct pnfs_commit_bucket {
+	struct list_head written;
+	struct list_head committing;
+	struct pnfs_layout_segment *wlseg;
+	struct pnfs_layout_segment *clseg;
+};
+
+struct pnfs_ds_commit_info {
+	int nwritten;
+	int ncommitting;
+	int nbuckets;
+	struct pnfs_commit_bucket *buckets;
+};
+
 #define NFS4_EXCHANGE_ID_LEN	(48)
 struct nfs41_exchange_id_args {
 	struct nfs_client		*client;
@@ -1242,6 +1257,18 @@ struct nfs_write_header {
 	struct nfs_write_data	rpc_data;
 };
 
+struct nfs_mds_commit_info {
+	atomic_t rpcs_out;
+	unsigned long		ncommit;
+	struct list_head	list;
+};
+
+struct nfs_commit_info {
+	spinlock_t			*lock;
+	struct nfs_mds_commit_info	*mds;
+	struct pnfs_ds_commit_info	*ds;
+};
+
 struct nfs_commit_data {
 	struct rpc_task		task;
 	struct inode		*inode;
-- 
cgit v0.10.2


From f453a54a01c7c0453ad9550906e3d2663dd486ac Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:54 -0400
Subject: NFS: create nfs_commit_completion_ops

Factors out the code that needs to change when directio
starts using these code paths.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 145e9e7..137f5cd 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -344,13 +344,12 @@ extern int nfs_initiate_commit(struct rpc_clnt *clnt,
 			       int how);
 extern void nfs_init_commit(struct nfs_commit_data *data,
 			    struct list_head *head,
-			    struct pnfs_layout_segment *lseg);
+			    struct pnfs_layout_segment *lseg,
+			    struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
 		      struct pnfs_layout_segment *lseg,
 		      struct nfs_commit_info *cinfo);
-void nfs_commit_clear_lock(struct nfs_inode *nfsi);
 void nfs_commitdata_release(struct nfs_commit_data *data);
-void nfs_commit_release_pages(struct nfs_commit_data *data);
 void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 				 struct nfs_commit_info *cinfo);
 void nfs_request_remove_commit_list(struct nfs_page *req,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index fe2cb55..26d1da4 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -347,12 +347,8 @@ static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
 static void filelayout_commit_release(void *calldata)
 {
 	struct nfs_commit_data *data = calldata;
-	struct nfs_commit_info cinfo;
 
-	nfs_commit_release_pages(data);
-	nfs_init_cinfo(&cinfo, data->inode, data->dreq);
-	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
-		nfs_commit_clear_lock(NFS_I(data->inode));
+	data->completion_ops->completion(data);
 	put_lseg(data->lseg);
 	nfs_commitdata_release(data);
 }
@@ -1108,7 +1104,7 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 	nreq += alloc_ds_commits(cinfo, &list);
 
 	if (nreq == 0) {
-		nfs_commit_clear_lock(NFS_I(inode));
+		cinfo->completion_ops->error_cleanup(NFS_I(inode));
 		goto out;
 	}
 
@@ -1117,14 +1113,14 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 	list_for_each_entry_safe(data, tmp, &list, pages) {
 		list_del_init(&data->pages);
 		if (!data->lseg) {
-			nfs_init_commit(data, mds_pages, NULL);
+			nfs_init_commit(data, mds_pages, NULL, cinfo);
 			nfs_initiate_commit(NFS_CLIENT(inode), data,
 					    data->mds_ops, how);
 		} else {
 			struct pnfs_commit_bucket *buckets;
 
 			buckets = cinfo->ds->buckets;
-			nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg);
+			nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo);
 			filelayout_initiate_commit(data, how);
 		}
 	}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 18bf700..333d01d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -46,6 +46,7 @@ static void nfs_redirty_request(struct nfs_page *req);
 static const struct rpc_call_ops nfs_write_common_ops;
 static const struct rpc_call_ops nfs_commit_ops;
 static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
+static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
 
 static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
@@ -505,6 +506,7 @@ static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
 	cinfo->lock = &inode->i_lock;
 	cinfo->mds = &NFS_I(inode)->commit_info;
 	cinfo->ds = pnfs_get_ds_info(inode);
+	cinfo->completion_ops = &nfs_commit_completion_ops;
 }
 
 void nfs_init_cinfo(struct nfs_commit_info *cinfo,
@@ -1358,13 +1360,12 @@ static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
 	return (ret < 0) ? ret : 1;
 }
 
-void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
 {
 	clear_bit(NFS_INO_COMMIT, &nfsi->flags);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
 }
-EXPORT_SYMBOL_GPL(nfs_commit_clear_lock);
 
 void nfs_commitdata_release(struct nfs_commit_data *data)
 {
@@ -1413,8 +1414,9 @@ EXPORT_SYMBOL_GPL(nfs_initiate_commit);
  * Set up the argument/result storage required for the RPC call.
  */
 void nfs_init_commit(struct nfs_commit_data *data,
-			    struct list_head *head,
-			    struct pnfs_layout_segment *lseg)
+		     struct list_head *head,
+		     struct pnfs_layout_segment *lseg,
+		     struct nfs_commit_info *cinfo)
 {
 	struct nfs_page *first = nfs_list_entry(head->next);
 	struct inode *inode = first->wb_context->dentry->d_inode;
@@ -1428,6 +1430,7 @@ void nfs_init_commit(struct nfs_commit_data *data,
 	data->cred	  = first->wb_context->cred;
 	data->lseg	  = lseg; /* reference transferred */
 	data->mds_ops     = &nfs_commit_ops;
+	data->completion_ops = cinfo->completion_ops;
 
 	data->args.fh     = NFS_FH(data->inode);
 	/* Note: we always request a commit of the entire inode */
@@ -1473,11 +1476,12 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 		goto out_bad;
 
 	/* Set up the argument struct */
-	nfs_init_commit(data, head, NULL);
+	nfs_init_commit(data, head, NULL, cinfo);
+	atomic_inc(&cinfo->mds->rpcs_out);
 	return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops, how);
  out_bad:
 	nfs_retry_commit(head, NULL, cinfo);
-	nfs_commit_clear_lock(NFS_I(inode));
+	cinfo->completion_ops->error_cleanup(NFS_I(inode));
 	return -ENOMEM;
 }
 
@@ -1495,10 +1499,11 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 	NFS_PROTO(data->inode)->commit_done(task, data);
 }
 
-void nfs_commit_release_pages(struct nfs_commit_data *data)
+static void nfs_commit_release_pages(struct nfs_commit_data *data)
 {
 	struct nfs_page	*req;
 	int status = data->task.tk_status;
+	struct nfs_commit_info cinfo;
 
 	while (!list_empty(&data->pages)) {
 		req = nfs_list_entry(data->pages.next);
@@ -1531,15 +1536,16 @@ void nfs_commit_release_pages(struct nfs_commit_data *data)
 	next:
 		nfs_unlock_request(req);
 	}
+	nfs_init_cinfo(&cinfo, data->inode, data->dreq);
+	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
+		nfs_commit_clear_lock(NFS_I(data->inode));
 }
-EXPORT_SYMBOL_GPL(nfs_commit_release_pages);
 
 static void nfs_commit_release(void *calldata)
 {
 	struct nfs_commit_data *data = calldata;
 
-	nfs_commit_release_pages(data);
-	nfs_commit_clear_lock(NFS_I(data->inode));
+	data->completion_ops->completion(data);
 	nfs_commitdata_release(calldata);
 }
 
@@ -1549,6 +1555,11 @@ static const struct rpc_call_ops nfs_commit_ops = {
 	.rpc_release = nfs_commit_release,
 };
 
+static const struct nfs_commit_completion_ops nfs_commit_completion_ops = {
+	.completion = nfs_commit_release_pages,
+	.error_cleanup = nfs_commit_clear_lock,
+};
+
 static int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
 				   int how, struct nfs_commit_info *cinfo)
 {
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 224e1e8..0e8b88a 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1263,10 +1263,18 @@ struct nfs_mds_commit_info {
 	struct list_head	list;
 };
 
+struct nfs_commit_data;
+struct nfs_inode;
+struct nfs_commit_completion_ops {
+	void (*error_cleanup) (struct nfs_inode *nfsi);
+	void (*completion) (struct nfs_commit_data *data);
+};
+
 struct nfs_commit_info {
 	spinlock_t			*lock;
 	struct nfs_mds_commit_info	*mds;
 	struct pnfs_ds_commit_info	*ds;
+	const struct nfs_commit_completion_ops *completion_ops;
 };
 
 struct nfs_commit_data {
@@ -1285,6 +1293,7 @@ struct nfs_commit_data {
 	struct nfs_client	*ds_clp;	/* pNFS data server */
 	int			ds_commit_index;
 	const struct rpc_call_ops *mds_ops;
+	const struct nfs_commit_completion_ops *completion_ops;
 	int (*commit_done_cb) (struct rpc_task *task, struct nfs_commit_data *data);
 };
 
-- 
cgit v0.10.2


From b359f9d09bcbaede09243cfe844172ba055d89fd Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:55 -0400
Subject: NFS: add dreq to nfs_commit_info

Need this to pass into nfs_commitdata_init, in order to keep data->dreq
accurate.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 333d01d..44a93d8 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -506,6 +506,7 @@ static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
 	cinfo->lock = &inode->i_lock;
 	cinfo->mds = &NFS_I(inode)->commit_info;
 	cinfo->ds = pnfs_get_ds_info(inode);
+	cinfo->dreq = NULL;
 	cinfo->completion_ops = &nfs_commit_completion_ops;
 }
 
@@ -1431,6 +1432,7 @@ void nfs_init_commit(struct nfs_commit_data *data,
 	data->lseg	  = lseg; /* reference transferred */
 	data->mds_ops     = &nfs_commit_ops;
 	data->completion_ops = cinfo->completion_ops;
+	data->dreq	  = cinfo->dreq;
 
 	data->args.fh     = NFS_FH(data->inode);
 	/* Note: we always request a commit of the entire inode */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0e8b88a..5f563bd 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1274,6 +1274,7 @@ struct nfs_commit_info {
 	spinlock_t			*lock;
 	struct nfs_mds_commit_info	*mds;
 	struct pnfs_ds_commit_info	*ds;
+	struct nfs_direct_req		*dreq;	/* O_DIRECT request */
 	const struct nfs_commit_completion_ops *completion_ops;
 };
 
-- 
cgit v0.10.2


From 56f9cd684d25f1bae901c5a872b8427f8b417c3f Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:56 -0400
Subject: NFS: avoid some stat gathering for direct io

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 44a93d8..56db9e7f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -472,9 +472,13 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 	nfs_list_add_request(req, dst);
 	cinfo->mds->ncommit++;
 	spin_unlock(cinfo->lock);
-	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-	__mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC);
+	if (!cinfo->dreq) {
+		inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		inc_bdi_stat(req->wb_page->mapping->backing_dev_info,
+			     BDI_RECLAIMABLE);
+		__mark_inode_dirty(req->wb_context->dentry->d_inode,
+				   I_DIRTY_DATASYNC);
+	}
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
 
@@ -1455,9 +1459,11 @@ void nfs_retry_commit(struct list_head *page_list,
 		req = nfs_list_entry(page_list->next);
 		nfs_list_remove_request(req);
 		nfs_mark_request_commit(req, lseg, cinfo);
-		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
-			     BDI_RECLAIMABLE);
+		if (!cinfo->dreq) {
+			dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+			dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+				     BDI_RECLAIMABLE);
+		}
 		nfs_unlock_request(req);
 	}
 }
-- 
cgit v0.10.2


From 1763da1234cba663b849476d451bdccac5147859 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:57 -0400
Subject: NFS: rewrite directio write to use async coalesce code

This also has the advantage that it allows directio to use pnfs.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4ba9a2c..d44de2f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -56,6 +56,7 @@
 
 #include "internal.h"
 #include "iostat.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -81,16 +82,19 @@ struct nfs_direct_req {
 	struct completion	completion;	/* wait for i/o completion */
 
 	/* commit state */
-	struct list_head	rewrite_list;	/* saved nfs_write_data structs */
-	struct nfs_commit_data *commit_data;	/* special write_data for commits */
+	struct nfs_mds_commit_info mds_cinfo;	/* Storage for cinfo */
+	struct pnfs_ds_commit_info ds_cinfo;	/* Storage for cinfo */
+	struct work_struct	work;
 	int			flags;
 #define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
 #define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
 	struct nfs_writeverf	verf;		/* unstable write verifier */
 };
 
+static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
+static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
-static const struct rpc_call_ops nfs_write_direct_ops;
+static void nfs_direct_write_schedule_work(struct work_struct *work);
 
 static inline void get_dreq(struct nfs_direct_req *dreq)
 {
@@ -131,6 +135,16 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 		page_cache_release(pages[i]);
 }
 
+void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
+			      struct nfs_direct_req *dreq)
+{
+	cinfo->lock = &dreq->lock;
+	cinfo->mds = &dreq->mds_cinfo;
+	cinfo->ds = &dreq->ds_cinfo;
+	cinfo->dreq = dreq;
+	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
+}
+
 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 {
 	struct nfs_direct_req *dreq;
@@ -142,7 +156,11 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 	kref_init(&dreq->kref);
 	kref_get(&dreq->kref);
 	init_completion(&dreq->completion);
-	INIT_LIST_HEAD(&dreq->rewrite_list);
+	dreq->mds_cinfo.ncommit = 0;
+	atomic_set(&dreq->mds_cinfo.rpcs_out, 0);
+	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
+	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
+	memset(&dreq->ds_cinfo, 0, sizeof(dreq->ds_cinfo));
 	dreq->iocb = NULL;
 	dreq->ctx = NULL;
 	dreq->l_ctx = NULL;
@@ -457,112 +475,60 @@ out:
 	return result;
 }
 
-static void nfs_direct_writehdr_release(struct nfs_write_header *whdr)
-{
-	struct nfs_write_data *data = &whdr->rpc_data;
-
-	if (data->pages.pagevec != data->pages.page_array)
-		kfree(data->pages.pagevec);
-	nfs_writehdr_free(&whdr->header);
-}
-
-static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
-{
-	while (!list_empty(&dreq->rewrite_list)) {
-		struct nfs_pgio_header *hdr = list_entry(dreq->rewrite_list.next, struct nfs_pgio_header, pages);
-		struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);
-		struct nfs_page_array *p = &whdr->rpc_data.pages;
-
-		list_del(&hdr->pages);
-		nfs_direct_release_pages(p->pagevec, p->npages);
-		nfs_direct_writehdr_release(whdr);
-	}
-}
-
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
-	struct inode *inode = dreq->inode;
-	struct list_head *p;
-	struct nfs_write_data *data;
-	struct nfs_pgio_header *hdr;
-	struct rpc_task *task;
-	struct rpc_message msg = {
-		.rpc_cred = dreq->ctx->cred,
-	};
-	struct rpc_task_setup task_setup_data = {
-		.rpc_client = NFS_CLIENT(inode),
-		.rpc_message = &msg,
-		.callback_ops = &nfs_write_direct_ops,
-		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC,
-	};
+	struct nfs_pageio_descriptor desc;
+	struct nfs_page *req, *tmp;
+	LIST_HEAD(reqs);
+	struct nfs_commit_info cinfo;
+	LIST_HEAD(failed);
+
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+	pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo);
+	spin_lock(cinfo.lock);
+	nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0);
+	spin_unlock(cinfo.lock);
 
 	dreq->count = 0;
 	get_dreq(dreq);
 
-	list_for_each(p, &dreq->rewrite_list) {
-		hdr = list_entry(p, struct nfs_pgio_header, pages);
-		data = &(container_of(hdr, struct nfs_write_header, header))->rpc_data;
-
-		get_dreq(dreq);
-
-		/* Use stable writes */
-		data->args.stable = NFS_FILE_SYNC;
-
-		/*
-		 * Reset data->res.
-		 */
-		nfs_fattr_init(&data->fattr);
-		data->res.count = data->args.count;
-		memset(&data->verf, 0, sizeof(data->verf));
-
-		/*
-		 * Reuse data->task; data->args should not have changed
-		 * since the original request was sent.
-		 */
-		task_setup_data.task = &data->task;
-		task_setup_data.callback_data = data;
-		msg.rpc_argp = &data->args;
-		msg.rpc_resp = &data->res;
-		NFS_PROTO(inode)->write_setup(data, &msg);
-
-		/*
-		 * We're called via an RPC callback, so BKL is already held.
-		 */
-		task = rpc_run_task(&task_setup_data);
-		if (!IS_ERR(task))
-			rpc_put_task(task);
-
-		dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
-				data->task.tk_pid,
-				inode->i_sb->s_id,
-				(long long)NFS_FILEID(inode),
-				data->args.count,
-				(unsigned long long)data->args.offset);
-	}
+	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE,
+			      &nfs_direct_write_completion_ops);
+	desc.pg_dreq = dreq;
 
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, inode);
-}
+	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
+		if (!nfs_pageio_add_request(&desc, req)) {
+			nfs_list_add_request(req, &failed);
+			spin_lock(cinfo.lock);
+			dreq->flags = 0;
+			dreq->error = -EIO;
+			spin_unlock(cinfo.lock);
+		}
+	}
+	nfs_pageio_complete(&desc);
 
-static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
-{
-	struct nfs_commit_data *data = calldata;
+	while (!list_empty(&failed)) {
+		page_cache_release(req->wb_page);
+		nfs_release_request(req);
+		nfs_unlock_request(req);
+	}
 
-	/* Call the NFS version-specific code */
-	NFS_PROTO(data->inode)->commit_done(task, data);
+	if (put_dreq(dreq))
+		nfs_direct_write_complete(dreq, dreq->inode);
 }
 
-static void nfs_direct_commit_release(void *calldata)
+static void nfs_direct_commit_complete(struct nfs_commit_data *data)
 {
-	struct nfs_commit_data *data = calldata;
 	struct nfs_direct_req *dreq = data->dreq;
+	struct nfs_commit_info cinfo;
+	struct nfs_page *req;
 	int status = data->task.tk_status;
 
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
 	if (status < 0) {
 		dprintk("NFS: %5u commit failed with error %d.\n",
-				data->task.tk_pid, status);
+			data->task.tk_pid, status);
 		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 	} else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
 		dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
@@ -570,59 +536,49 @@ static void nfs_direct_commit_release(void *calldata)
 	}
 
 	dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
-	nfs_direct_write_complete(dreq, data->inode);
-	nfs_commit_free(data);
+	while (!list_empty(&data->pages)) {
+		req = nfs_list_entry(data->pages.next);
+		nfs_list_remove_request(req);
+		if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
+			/* Note the rewrite will go through mds */
+			nfs_mark_request_commit(req, NULL, &cinfo);
+		} else {
+			page_cache_release(req->wb_page);
+			nfs_release_request(req);
+		}
+		nfs_unlock_request(req);
+	}
+
+	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
+		nfs_direct_write_complete(dreq, data->inode);
 }
 
-static const struct rpc_call_ops nfs_commit_direct_ops = {
-	.rpc_call_prepare = nfs_commit_prepare,
-	.rpc_call_done = nfs_direct_commit_result,
-	.rpc_release = nfs_direct_commit_release,
+static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
+{
+	/* There is no lock to clear */
+}
+
+static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
+	.completion = nfs_direct_commit_complete,
+	.error_cleanup = nfs_direct_error_cleanup,
 };
 
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 {
-	struct nfs_commit_data *data = dreq->commit_data;
-	struct rpc_task *task;
-	struct rpc_message msg = {
-		.rpc_argp = &data->args,
-		.rpc_resp = &data->res,
-		.rpc_cred = dreq->ctx->cred,
-	};
-	struct rpc_task_setup task_setup_data = {
-		.task = &data->task,
-		.rpc_client = NFS_CLIENT(dreq->inode),
-		.rpc_message = &msg,
-		.callback_ops = &nfs_commit_direct_ops,
-		.callback_data = data,
-		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC,
-	};
-
-	data->inode = dreq->inode;
-	data->cred = msg.rpc_cred;
-
-	data->args.fh = NFS_FH(data->inode);
-	data->args.offset = 0;
-	data->args.count = 0;
-	data->res.fattr = &data->fattr;
-	data->res.verf = &data->verf;
-	nfs_fattr_init(&data->fattr);
-
-	NFS_PROTO(data->inode)->commit_setup(data, &msg);
-
-	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
-	dreq->commit_data = NULL;
-
-	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
-
-	task = rpc_run_task(&task_setup_data);
-	if (!IS_ERR(task))
-		rpc_put_task(task);
+	int res;
+	struct nfs_commit_info cinfo;
+	LIST_HEAD(mds_list);
+
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
+	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
+	if (res < 0) /* res == -ENOMEM */
+		nfs_direct_write_reschedule(dreq);
 }
 
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+static void nfs_direct_write_schedule_work(struct work_struct *work)
 {
+	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
 	int flags = dreq->flags;
 
 	dreq->flags = 0;
@@ -634,90 +590,29 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 			nfs_direct_write_reschedule(dreq);
 			break;
 		default:
-			if (dreq->commit_data != NULL)
-				nfs_commit_free(dreq->commit_data);
-			nfs_direct_free_writedata(dreq);
-			nfs_zap_mapping(inode, inode->i_mapping);
+			nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
 			nfs_direct_complete(dreq);
 	}
 }
 
-static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 {
-	dreq->commit_data = nfs_commitdata_alloc();
-	if (dreq->commit_data != NULL)
-		dreq->commit_data->dreq = dreq;
+	schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
 }
+
 #else
-static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
-{
-	dreq->commit_data = NULL;
-}
 
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 {
-	nfs_direct_free_writedata(dreq);
 	nfs_zap_mapping(inode, inode->i_mapping);
 	nfs_direct_complete(dreq);
 }
 #endif
 
-static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
-{
-	struct nfs_write_data *data = calldata;
-
-	nfs_writeback_done(task, data);
-}
-
 /*
  * NB: Return the value of the first error return code.  Subsequent
  *     errors after the first one are ignored.
  */
-static void nfs_direct_write_release(void *calldata)
-{
-	struct nfs_write_data *data = calldata;
-	struct nfs_pgio_header *hdr = data->header;
-	struct nfs_direct_req *dreq = (struct nfs_direct_req *) hdr->req;
-	int status = data->task.tk_status;
-
-	spin_lock(&dreq->lock);
-
-	if (unlikely(status < 0)) {
-		/* An error has occurred, so we should not commit */
-		dreq->flags = 0;
-		dreq->error = status;
-	}
-	if (unlikely(dreq->error != 0))
-		goto out_unlock;
-
-	dreq->count += data->res.count;
-
-	if (data->res.verf->committed != NFS_FILE_SYNC) {
-		switch (dreq->flags) {
-			case 0:
-				memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
-				dreq->flags = NFS_ODIRECT_DO_COMMIT;
-				break;
-			case NFS_ODIRECT_DO_COMMIT:
-				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
-					dprintk("NFS: %5u write verify failed\n", data->task.tk_pid);
-					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-				}
-		}
-	}
-out_unlock:
-	spin_unlock(&dreq->lock);
-
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, hdr->inode);
-}
-
-static const struct rpc_call_ops nfs_write_direct_ops = {
-	.rpc_call_prepare = nfs_write_prepare,
-	.rpc_call_done = nfs_direct_write_result,
-	.rpc_release = nfs_direct_write_release,
-};
-
 /*
  * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
  * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
@@ -725,143 +620,181 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
  * handled automatically by nfs_direct_write_result().  Otherwise, if
  * no requests have been sent, just return an error.
  */
-static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
+static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
 						 const struct iovec *iov,
-						 loff_t pos, int sync)
+						 loff_t pos)
 {
+	struct nfs_direct_req *dreq = desc->pg_dreq;
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	unsigned long user_addr = (unsigned long)iov->iov_base;
 	size_t count = iov->iov_len;
-	struct rpc_task *task;
-	struct rpc_message msg = {
-		.rpc_cred = ctx->cred,
-	};
-	struct rpc_task_setup task_setup_data = {
-		.rpc_client = NFS_CLIENT(inode),
-		.rpc_message = &msg,
-		.callback_ops = &nfs_write_direct_ops,
-		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC,
-	};
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	unsigned int pgbase;
 	int result;
 	ssize_t started = 0;
+	struct page **pagevec = NULL;
+	unsigned int npages;
 
 	do {
-		struct nfs_write_header *whdr;
-		struct nfs_write_data *data;
-		struct nfs_page_array *pages;
 		size_t bytes;
+		int i;
 
 		pgbase = user_addr & ~PAGE_MASK;
-		bytes = min(wsize,count);
+		bytes = min(max(wsize, PAGE_SIZE), count);
 
 		result = -ENOMEM;
-		whdr = nfs_writehdr_alloc();
-		if (unlikely(!whdr))
+		npages = nfs_page_array_len(pgbase, bytes);
+		if (!pagevec)
+			pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL);
+		if (!pagevec)
 			break;
 
-		data = nfs_writedata_alloc(&whdr->header, nfs_page_array_len(pgbase, bytes));
-		if (!data) {
-			nfs_writehdr_free(&whdr->header);
-			break;
-		}
-		data->header = &whdr->header;
-		atomic_inc(&data->header->refcnt);
-		pages = &data->pages;
-
 		down_read(&current->mm->mmap_sem);
 		result = get_user_pages(current, current->mm, user_addr,
-					pages->npages, 0, 0, pages->pagevec, NULL);
+					npages, 0, 0, pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
-		if (result < 0) {
-			nfs_direct_writehdr_release(whdr);
+		if (result < 0)
 			break;
-		}
-		if ((unsigned)result < pages->npages) {
+
+		if ((unsigned)result < npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
-				nfs_direct_release_pages(pages->pagevec, result);
-				nfs_direct_writehdr_release(whdr);
+				nfs_direct_release_pages(pagevec, result);
 				break;
 			}
 			bytes -= pgbase;
-			pages->npages = result;
+			npages = result;
 		}
 
-		get_dreq(dreq);
-
-		list_move_tail(&whdr->header.pages, &dreq->rewrite_list);
-
-		whdr->header.req = (struct nfs_page *) dreq;
-		whdr->header.inode = inode;
-		whdr->header.cred = msg.rpc_cred;
-		data->args.fh = NFS_FH(inode);
-		data->args.context = ctx;
-		data->args.lock_context = dreq->l_ctx;
-		data->args.offset = pos;
-		data->args.pgbase = pgbase;
-		data->args.pages = pages->pagevec;
-		data->args.count = bytes;
-		data->args.stable = sync;
-		data->res.fattr = &data->fattr;
-		data->res.count = bytes;
-		data->res.verf = &data->verf;
-		nfs_fattr_init(&data->fattr);
-
-		task_setup_data.task = &data->task;
-		task_setup_data.callback_data = data;
-		msg.rpc_argp = &data->args;
-		msg.rpc_resp = &data->res;
-		NFS_PROTO(inode)->write_setup(data, &msg);
-
-		task = rpc_run_task(&task_setup_data);
-		if (IS_ERR(task))
-			break;
+		for (i = 0; i < npages; i++) {
+			struct nfs_page *req;
+			unsigned int req_len = min(bytes, PAGE_SIZE - pgbase);
 
-		dprintk("NFS: %5u initiated direct write call "
-			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
-				task->tk_pid,
-				inode->i_sb->s_id,
-				(long long)NFS_FILEID(inode),
-				bytes,
-				(unsigned long long)data->args.offset);
-		rpc_put_task(task);
-
-		started += bytes;
-		user_addr += bytes;
-		pos += bytes;
-
-		/* FIXME: Remove this useless math from the final patch */
-		pgbase += bytes;
-		pgbase &= ~PAGE_MASK;
-		BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
-
-		count -= bytes;
+			req = nfs_create_request(dreq->ctx, dreq->inode,
+						 pagevec[i],
+						 pgbase, req_len);
+			if (IS_ERR(req)) {
+				nfs_direct_release_pages(pagevec + i,
+							 npages - i);
+				result = PTR_ERR(req);
+				break;
+			}
+			nfs_lock_request(req);
+			req->wb_index = pos >> PAGE_SHIFT;
+			req->wb_offset = pos & ~PAGE_MASK;
+			if (!nfs_pageio_add_request(desc, req)) {
+				result = desc->pg_error;
+				nfs_unlock_request(req);
+				nfs_release_request(req);
+				nfs_direct_release_pages(pagevec + i,
+							 npages - i);
+			}
+			pgbase = 0;
+			bytes -= req_len;
+			started += req_len;
+			user_addr += req_len;
+			pos += req_len;
+			count -= req_len;
+		}
 	} while (count != 0);
 
+	kfree(pagevec);
+
 	if (started)
 		return started;
 	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
+static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
+{
+	struct nfs_direct_req *dreq = hdr->dreq;
+	struct nfs_commit_info cinfo;
+	int bit = -1;
+	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+		goto out_put;
+
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+
+	spin_lock(&dreq->lock);
+
+	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
+		dreq->flags = 0;
+		dreq->error = hdr->error;
+	}
+	if (dreq->error != 0)
+		bit = NFS_IOHDR_ERROR;
+	else {
+		dreq->count += hdr->good_bytes;
+		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
+			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+			bit = NFS_IOHDR_NEED_RESCHED;
+		} else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+			if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
+				bit = NFS_IOHDR_NEED_RESCHED;
+			else if (dreq->flags == 0) {
+				memcpy(&dreq->verf, &req->wb_verf,
+				       sizeof(dreq->verf));
+				bit = NFS_IOHDR_NEED_COMMIT;
+				dreq->flags = NFS_ODIRECT_DO_COMMIT;
+			} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
+				if (memcmp(&dreq->verf, &req->wb_verf, sizeof(dreq->verf))) {
+					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+					bit = NFS_IOHDR_NEED_RESCHED;
+				} else
+					bit = NFS_IOHDR_NEED_COMMIT;
+			}
+		}
+	}
+	spin_unlock(&dreq->lock);
+
+	while (!list_empty(&hdr->pages)) {
+		req = nfs_list_entry(hdr->pages.next);
+		nfs_list_remove_request(req);
+		switch (bit) {
+		case NFS_IOHDR_NEED_RESCHED:
+		case NFS_IOHDR_NEED_COMMIT:
+			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
+			break;
+		default:
+			page_cache_release(req->wb_page);
+			nfs_release_request(req);
+		}
+		nfs_unlock_request(req);
+	}
+
+out_put:
+	if (put_dreq(dreq))
+		nfs_direct_write_complete(dreq, hdr->inode);
+	hdr->release(hdr);
+}
+
+static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
+	.error_cleanup = nfs_sync_pgio_error,
+	.init_hdr = nfs_direct_pgio_init,
+	.completion = nfs_direct_write_completion,
+};
+
 static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 					       const struct iovec *iov,
 					       unsigned long nr_segs,
-					       loff_t pos, int sync)
+					       loff_t pos)
 {
+	struct nfs_pageio_descriptor desc;
 	ssize_t result = 0;
 	size_t requested_bytes = 0;
 	unsigned long seg;
 
+	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_COND_STABLE,
+			      &nfs_direct_write_completion_ops);
+	desc.pg_dreq = dreq;
 	get_dreq(dreq);
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
-		result = nfs_direct_write_schedule_segment(dreq, vec,
-							   pos, sync);
+		result = nfs_direct_write_schedule_segment(&desc, vec, pos);
 		if (result < 0)
 			break;
 		requested_bytes += result;
@@ -869,6 +802,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 			break;
 		pos += vec->iov_len;
 	}
+	nfs_pageio_complete(&desc);
 
 	/*
 	 * If no bytes were started, return the error, and let the
@@ -891,16 +825,10 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t result = -ENOMEM;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct nfs_direct_req *dreq;
-	size_t wsize = NFS_SERVER(inode)->wsize;
-	int sync = NFS_UNSTABLE;
 
 	dreq = nfs_direct_req_alloc();
 	if (!dreq)
 		goto out;
-	nfs_alloc_commit_data(dreq);
-
-	if (dreq->commit_data == NULL || count <= wsize)
-		sync = NFS_FILE_SYNC;
 
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
@@ -910,7 +838,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
-	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
+	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
 		result = nfs_direct_wait(dreq);
 out_release:
@@ -1030,10 +958,15 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	task_io_account_write(count);
 
 	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
+	if (retval > 0) {
+		struct inode *inode = mapping->host;
 
-	if (retval > 0)
 		iocb->ki_pos = pos + retval;
-
+		spin_lock(&inode->i_lock);
+		if (i_size_read(inode) < iocb->ki_pos)
+			i_size_write(inode, iocb->ki_pos);
+		spin_unlock(&inode->i_lock);
+	}
 out:
 	return retval;
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 137f5cd..d68810f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -320,10 +320,11 @@ extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_readdata_release(struct nfs_read_data *rdata);
 
 /* write.c */
+extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+			struct inode *inode, int ioflags,
+			const struct nfs_pgio_completion_ops *compl_ops);
 extern struct nfs_write_header *nfs_writehdr_alloc(void);
 extern void nfs_writehdr_free(struct nfs_pgio_header *hdr);
-extern struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr,
-						  unsigned int pagecount);
 extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
 			     struct nfs_pgio_header *hdr);
 extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
@@ -346,6 +347,15 @@ extern void nfs_init_commit(struct nfs_commit_data *data,
 			    struct list_head *head,
 			    struct pnfs_layout_segment *lseg,
 			    struct nfs_commit_info *cinfo);
+int nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
+			 struct nfs_commit_info *cinfo, int max);
+int nfs_scan_commit(struct inode *inode, struct list_head *dst,
+		    struct nfs_commit_info *cinfo);
+void nfs_mark_request_commit(struct nfs_page *req,
+			     struct pnfs_layout_segment *lseg,
+			     struct nfs_commit_info *cinfo);
+int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
+			    int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
 		      struct pnfs_layout_segment *lseg,
 		      struct nfs_commit_info *cinfo);
@@ -365,6 +375,10 @@ extern int nfs_migrate_page(struct address_space *,
 #define nfs_migrate_page NULL
 #endif
 
+/* direct.c */
+void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
+			      struct nfs_direct_req *dreq);
+
 /* nfs4proc.c */
 extern void __nfs4_read_done_cb(struct nfs_read_data *);
 extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 26d1da4..806a55f 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -996,12 +996,9 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
 }
 
 static int
-filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
-			       struct nfs_commit_info *cinfo,
-			       int max)
+transfer_commit_list(struct list_head *src, struct list_head *dst,
+		     struct nfs_commit_info *cinfo, int max)
 {
-	struct list_head *src = &bucket->written;
-	struct list_head *dst = &bucket->committing;
 	struct nfs_page *req, *tmp;
 	int ret = 0;
 
@@ -1014,9 +1011,22 @@ filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
 		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 		nfs_list_add_request(req, dst);
 		ret++;
-		if (ret == max)
+		if ((ret == max) && !cinfo->dreq)
 			break;
 	}
+	return ret;
+}
+
+static int
+filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
+			       struct nfs_commit_info *cinfo,
+			       int max)
+{
+	struct list_head *src = &bucket->written;
+	struct list_head *dst = &bucket->committing;
+	int ret;
+
+	ret = transfer_commit_list(src, dst, cinfo, max);
 	if (ret) {
 		cinfo->ds->nwritten -= ret;
 		cinfo->ds->ncommitting += ret;
@@ -1046,6 +1056,27 @@ static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo,
 	return rv;
 }
 
+/* Pull everything off the committing lists and dump into @dst */
+static void filelayout_recover_commit_reqs(struct list_head *dst,
+					   struct nfs_commit_info *cinfo)
+{
+	struct pnfs_commit_bucket *b;
+	int i;
+
+	/* NOTE cinfo->lock is NOT held, relying on fact that this is
+	 * only called on single thread per dreq.
+	 * Can't take the lock because need to do put_lseg
+	 */
+	for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+		if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
+			BUG_ON(!list_empty(&b->written));
+			put_lseg(b->wlseg);
+			b->wlseg = NULL;
+		}
+	}
+	cinfo->ds->nwritten = 0;
+}
+
 static unsigned int
 alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
 {
@@ -1170,6 +1201,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.mark_request_commit	= filelayout_mark_request_commit,
 	.clear_request_commit	= filelayout_clear_request_commit,
 	.scan_commit_lists	= filelayout_scan_commit_lists,
+	.recover_commit_reqs	= filelayout_recover_commit_reqs,
 	.commit_pagelist	= filelayout_commit_pagelist,
 	.read_pagelist		= filelayout_read_pagelist,
 	.write_pagelist		= filelayout_write_pagelist,
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4cd8760..8efbee7 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -102,6 +102,8 @@ struct pnfs_layoutdriver_type {
 				      struct nfs_commit_info *cinfo);
 	int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
 				  int max);
+	void (*recover_commit_reqs) (struct list_head *list,
+				     struct nfs_commit_info *cinfo);
 	int (*commit_pagelist)(struct inode *inode,
 			       struct list_head *mds_pages,
 			       int how,
@@ -323,6 +325,15 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
 		return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
 }
 
+static inline void
+pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
+			 struct nfs_commit_info *cinfo)
+{
+	if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
+		return;
+	NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
+}
+
 /* Should the pNFS client commit and return the layout upon a setattr */
 static inline bool
 pnfs_ld_layoutret_on_setattr(struct inode *inode)
@@ -456,6 +467,12 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
 	return 0;
 }
 
+static inline void
+pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
+			 struct nfs_commit_info *cinfo)
+{
+}
+
 static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
 	return 0;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 56db9e7f..fec214b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -39,9 +39,6 @@
 /*
  * Local function declarations
  */
-static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
-			struct inode *inode, int ioflags,
-			const struct nfs_pgio_completion_ops *compl_ops);
 static void nfs_redirty_request(struct nfs_page *req);
 static const struct rpc_call_ops nfs_write_common_ops;
 static const struct rpc_call_ops nfs_commit_ops;
@@ -87,8 +84,8 @@ struct nfs_write_header *nfs_writehdr_alloc(void)
 	return p;
 }
 
-struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr,
-					   unsigned int pagecount)
+static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr,
+						  unsigned int pagecount)
 {
 	struct nfs_write_data *data, *prealloc;
 
@@ -518,14 +515,17 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
 		    struct inode *inode,
 		    struct nfs_direct_req *dreq)
 {
-	nfs_init_cinfo_from_inode(cinfo, inode);
+	if (dreq)
+		nfs_init_cinfo_from_dreq(cinfo, dreq);
+	else
+		nfs_init_cinfo_from_inode(cinfo, inode);
 }
 EXPORT_SYMBOL_GPL(nfs_init_cinfo);
 
 /*
  * Add a request to the inode's commit list.
  */
-static void
+void
 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
 			struct nfs_commit_info *cinfo)
 {
@@ -567,7 +567,7 @@ int nfs_write_need_commit(struct nfs_write_data *data)
 }
 
 #else
-static void
+void
 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
 			struct nfs_commit_info *cinfo)
 {
@@ -632,7 +632,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 }
 
 /* cinfo->lock held by caller */
-static int
+int
 nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
 		     struct nfs_commit_info *cinfo, int max)
 {
@@ -647,7 +647,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
 		nfs_request_remove_commit_list(req, cinfo);
 		nfs_list_add_request(req, dst);
 		ret++;
-		if (ret == max)
+		if ((ret == max) && !cinfo->dreq)
 			break;
 	}
 	return ret;
@@ -662,7 +662,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
  * Moves requests from the inode's 'commit' request list.
  * The requests are *not* checked to ensure that they form a contiguous set.
  */
-static int
+int
 nfs_scan_commit(struct inode *inode, struct list_head *dst,
 		struct nfs_commit_info *cinfo)
 {
@@ -686,8 +686,8 @@ static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 	return 0;
 }
 
-static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst,
-				  struct nfs_commit_info *cinfo)
+int nfs_scan_commit(struct inode *inode, struct list_head *dst,
+		    struct nfs_commit_info *cinfo)
 {
 	return 0;
 }
@@ -1202,9 +1202,9 @@ void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
 
-static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
-				struct inode *inode, int ioflags,
-				const struct nfs_pgio_completion_ops *compl_ops)
+void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+			   struct inode *inode, int ioflags,
+			   const struct nfs_pgio_completion_ops *compl_ops)
 {
 	if (!pnfs_pageio_init_write(pgio, inode, ioflags, compl_ops))
 		nfs_pageio_init_write_mds(pgio, inode, ioflags, compl_ops);
@@ -1568,8 +1568,8 @@ static const struct nfs_commit_completion_ops nfs_commit_completion_ops = {
 	.error_cleanup = nfs_commit_clear_lock,
 };
 
-static int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
-				   int how, struct nfs_commit_info *cinfo)
+int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
+			    int how, struct nfs_commit_info *cinfo)
 {
 	int status;
 
-- 
cgit v0.10.2


From df0117481cd94dbb8970f4be9d05b0568fa09ab1 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Tue, 24 Apr 2012 14:50:34 -0400
Subject: NFS: Prevent garbage cinfo->ds from leaking out

This is a bugfix that applies on top of the previous directio patches,
that fixes a bug introduced in "NFS: create struct nfs_commit_info".

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 806a55f..80a63f6 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -1184,7 +1184,12 @@ filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
 static struct pnfs_ds_commit_info *
 filelayout_get_ds_info(struct inode *inode)
 {
-	return &FILELAYOUT_FROM_HDR(NFS_I(inode)->layout)->commit_info;
+	struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
+
+	if (layout == NULL)
+		return NULL;
+	else
+		return &FILELAYOUT_FROM_HDR(layout)->commit_info;
 }
 
 static struct pnfs_layoutdriver_type filelayout_type = {
-- 
cgit v0.10.2


From 2671bfc3beb44e70636bd0208274426db57f73b5 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Fri, 27 Apr 2012 13:27:44 -0400
Subject: NFS: Remove secinfo knowledge out of the generic client

And also remove the unneeded rpc_op.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d68810f..d699444 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -286,9 +286,6 @@ extern void nfs_sb_deactive(struct super_block *sb);
 extern char *nfs_path(char **p, struct dentry *dentry,
 		      char *buffer, ssize_t buflen);
 extern struct vfsmount *nfs_d_automount(struct path *path);
-#ifdef CONFIG_NFS_V4
-rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
-#endif
 
 /* getroot.c */
 extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index d51868e..2a9591b 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -119,35 +119,6 @@ Elong:
 }
 
 #ifdef CONFIG_NFS_V4
-rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
-{
-	struct gss_api_mech *mech;
-	struct xdr_netobj oid;
-	int i;
-	rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
-
-	for (i = 0; i < flavors->num_flavors; i++) {
-		struct nfs4_secinfo_flavor *flavor;
-		flavor = &flavors->flavors[i];
-
-		if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) {
-			pseudoflavor = flavor->flavor;
-			break;
-		} else if (flavor->flavor == RPC_AUTH_GSS) {
-			oid.len  = flavor->gss.sec_oid4.len;
-			oid.data = flavor->gss.sec_oid4.data;
-			mech = gss_mech_get_by_OID(&oid);
-			if (!mech)
-				continue;
-			pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service);
-			gss_mech_put(mech);
-			break;
-		}
-	}
-
-	return pseudoflavor;
-}
-
 static struct rpc_clnt *nfs_lookup_mountpoint(struct inode *dir,
 					      struct qstr *name,
 					      struct nfs_fh *fh,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 8d75021..53a487e 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -206,6 +206,7 @@ extern const struct dentry_operations nfs4_dentry_operations;
 extern const struct inode_operations nfs4_dir_inode_operations;
 
 /* nfs4namespace.c */
+rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *);
 
 /* nfs4proc.c */
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index a7f3ded..a69ee39 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -132,6 +132,35 @@ static size_t nfs_parse_server_name(char *string, size_t len,
 	return ret;
 }
 
+rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
+{
+	struct gss_api_mech *mech;
+	struct xdr_netobj oid;
+	int i;
+	rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
+
+	for (i = 0; i < flavors->num_flavors; i++) {
+		struct nfs4_secinfo_flavor *flavor;
+		flavor = &flavors->flavors[i];
+
+		if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) {
+			pseudoflavor = flavor->flavor;
+			break;
+		} else if (flavor->flavor == RPC_AUTH_GSS) {
+			oid.len  = flavor->gss.sec_oid4.len;
+			oid.data = flavor->gss.sec_oid4.data;
+			mech = gss_mech_get_by_OID(&oid);
+			if (!mech)
+				continue;
+			pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service);
+			gss_mech_put(mech);
+			break;
+		}
+	}
+
+	return pseudoflavor;
+}
+
 static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name)
 {
 	struct page *page;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 87af80d..fa661b9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6610,7 +6610,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.close_context  = nfs4_close_context,
 	.open_context	= nfs4_atomic_open,
 	.init_client	= nfs4_init_client,
-	.secinfo	= nfs4_proc_secinfo,
 };
 
 static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 5f563bd..eb1f143 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1397,7 +1397,6 @@ struct nfs_rpc_ops {
 				struct iattr *iattr);
 	int	(*init_client) (struct nfs_client *, const struct rpc_timeout *,
 				const char *, rpc_authflavor_t, int);
-	int	(*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
 };
 
 /*
-- 
cgit v0.10.2


From 281cad46b34db4dbb1d1e603f7b9cfe25d1ae7c9 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Fri, 27 Apr 2012 13:27:45 -0400
Subject: NFS: Create a submount rpc_op

This simplifies the code for v2 and v3 and gives v4 a chance to decide
on referrals without needing to modify the generic client.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d699444..0fd1efa 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -185,17 +185,6 @@ static inline void nfs_fs_proc_exit(void)
 }
 #endif
 
-/* nfs4namespace.c */
-#ifdef CONFIG_NFS_V4
-extern struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry);
-#else
-static inline
-struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry)
-{
-	return ERR_PTR(-ENOENT);
-}
-#endif
-
 /* callback_xdr.c */
 extern struct svc_version nfs4_callback_version1;
 extern struct svc_version nfs4_callback_version4;
@@ -286,6 +275,10 @@ extern void nfs_sb_deactive(struct super_block *sb);
 extern char *nfs_path(char **p, struct dentry *dentry,
 		      char *buffer, ssize_t buflen);
 extern struct vfsmount *nfs_d_automount(struct path *path);
+struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *,
+			      struct nfs_fh *, struct nfs_fattr *);
+struct vfsmount *nfs_do_submount(struct dentry *, struct nfs_fh *,
+				 struct nfs_fattr *, rpc_authflavor_t);
 
 /* getroot.c */
 extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 2a9591b..e36fd8a 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -26,11 +26,6 @@ static LIST_HEAD(nfs_automount_list);
 static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
 int nfs_mountpoint_expiry_timeout = 500 * HZ;
 
-static struct vfsmount *nfs_do_submount(struct dentry *dentry,
-					struct nfs_fh *fh,
-					struct nfs_fattr *fattr,
-					rpc_authflavor_t authflavor);
-
 /*
  * nfs_path - reconstruct the path given an arbitrary dentry
  * @base - used to return pointer to the end of devname part of path
@@ -118,35 +113,6 @@ Elong:
 	return ERR_PTR(-ENAMETOOLONG);
 }
 
-#ifdef CONFIG_NFS_V4
-static struct rpc_clnt *nfs_lookup_mountpoint(struct inode *dir,
-					      struct qstr *name,
-					      struct nfs_fh *fh,
-					      struct nfs_fattr *fattr)
-{
-	int err;
-
-	if (NFS_PROTO(dir)->version == 4)
-		return nfs4_proc_lookup_mountpoint(dir, name, fh, fattr);
-
-	err = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, name, fh, fattr);
-	if (err)
-		return ERR_PTR(err);
-	return rpc_clone_client(NFS_SERVER(dir)->client);
-}
-#else /* CONFIG_NFS_V4 */
-static inline struct rpc_clnt *nfs_lookup_mountpoint(struct inode *dir,
-						     struct qstr *name,
-						     struct nfs_fh *fh,
-						     struct nfs_fattr *fattr)
-{
-	int err = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, name, fh, fattr);
-	if (err)
-		return ERR_PTR(err);
-	return rpc_clone_client(NFS_SERVER(dir)->client);
-}
-#endif /* CONFIG_NFS_V4 */
-
 /*
  * nfs_d_automount - Handle crossing a mountpoint on the server
  * @path - The mountpoint
@@ -162,10 +128,9 @@ static inline struct rpc_clnt *nfs_lookup_mountpoint(struct inode *dir,
 struct vfsmount *nfs_d_automount(struct path *path)
 {
 	struct vfsmount *mnt;
-	struct dentry *parent;
+	struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
 	struct nfs_fh *fh = NULL;
 	struct nfs_fattr *fattr = NULL;
-	struct rpc_clnt *client;
 
 	dprintk("--> nfs_d_automount()\n");
 
@@ -181,21 +146,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
 
 	dprintk("%s: enter\n", __func__);
 
-	/* Look it up again to get its attributes */
-	parent = dget_parent(path->dentry);
-	client = nfs_lookup_mountpoint(parent->d_inode, &path->dentry->d_name, fh, fattr);
-	dput(parent);
-	if (IS_ERR(client)) {
-		mnt = ERR_CAST(client);
-		goto out;
-	}
-
-	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
-		mnt = nfs_do_refmount(client, path->dentry);
-	else
-		mnt = nfs_do_submount(path->dentry, fh, fattr, client->cl_auth->au_flavor);
-	rpc_shutdown_client(client);
-
+	mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr);
 	if (IS_ERR(mnt))
 		goto out;
 
@@ -268,10 +219,8 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
  * @authflavor - security flavor to use when performing the mount
  *
  */
-static struct vfsmount *nfs_do_submount(struct dentry *dentry,
-					struct nfs_fh *fh,
-					struct nfs_fattr *fattr,
-					rpc_authflavor_t authflavor)
+struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh,
+				 struct nfs_fattr *fattr, rpc_authflavor_t authflavor)
 {
 	struct nfs_clone_mount mountdata = {
 		.sb = dentry->d_sb,
@@ -304,3 +253,19 @@ out:
 	dprintk("<-- nfs_do_submount() = %p\n", mnt);
 	return mnt;
 }
+
+struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
+			      struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+	int err;
+	struct dentry *parent = dget_parent(dentry);
+
+	/* Look it up again to get its attributes */
+	err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode,
+						  &dentry->d_name, fh, fattr);
+	dput(parent);
+	if (err != 0)
+		return ERR_PTR(err);
+
+	return nfs_do_submount(dentry, fh, fattr, server->client->cl_auth->au_flavor);
+}
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 56dcefc..c23214d 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -885,6 +885,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.file_inode_ops	= &nfs3_file_inode_operations,
 	.file_ops	= &nfs_file_operations,
 	.getroot	= nfs3_proc_get_root,
+	.submount	= nfs_submount,
 	.getattr	= nfs3_proc_getattr,
 	.setattr	= nfs3_proc_setattr,
 	.lookup		= nfs3_proc_lookup,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 53a487e..97365b0 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -208,6 +208,8 @@ extern const struct inode_operations nfs4_dir_inode_operations;
 /* nfs4namespace.c */
 rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *);
+struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
+			       struct nfs_fh *, struct nfs_fattr *);
 
 /* nfs4proc.c */
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index a69ee39..80fc0fe 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -329,7 +329,7 @@ out:
  * @dentry - dentry of referral
  *
  */
-struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry)
+static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry)
 {
 	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
 	struct dentry *parent;
@@ -370,3 +370,25 @@ out:
 	dprintk("%s: done\n", __func__);
 	return mnt;
 }
+
+struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
+			       struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct rpc_clnt *client;
+	struct vfsmount *mnt;
+
+	/* Look it up again to get its attributes and sec flavor */
+	client = nfs4_proc_lookup_mountpoint(parent->d_inode, &dentry->d_name, fh, fattr);
+	dput(parent);
+	if (IS_ERR(client))
+		return ERR_CAST(client);
+
+	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+		mnt = nfs_do_refmount(client, dentry);
+	else
+		mnt = nfs_do_submount(dentry, fh, fattr, client->cl_auth->au_flavor);
+
+	rpc_shutdown_client(client);
+	return mnt;
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fa661b9..2091af2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6571,6 +6571,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.file_inode_ops	= &nfs4_file_inode_operations,
 	.file_ops	= &nfs4_file_operations,
 	.getroot	= nfs4_proc_get_root,
+	.submount	= nfs4_submount,
 	.getattr	= nfs4_proc_getattr,
 	.setattr	= nfs4_proc_setattr,
 	.lookup		= nfs4_proc_lookup,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 22ee705..76b3229 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -742,6 +742,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.file_inode_ops	= &nfs_file_inode_operations,
 	.file_ops	= &nfs_file_operations,
 	.getroot	= nfs_proc_get_root,
+	.submount	= nfs_submount,
 	.getattr	= nfs_proc_getattr,
 	.setattr	= nfs_proc_setattr,
 	.lookup		= nfs_proc_lookup,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index eb1f143..4dada94 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1341,6 +1341,8 @@ struct nfs_rpc_ops {
 
 	int	(*getroot) (struct nfs_server *, struct nfs_fh *,
 			    struct nfs_fsinfo *);
+	struct vfsmount *(*submount) (struct nfs_server *, struct dentry *,
+				      struct nfs_fh *, struct nfs_fattr *);
 	int	(*getattr) (struct nfs_server *, struct nfs_fh *,
 			    struct nfs_fattr *);
 	int	(*setattr) (struct dentry *, struct nfs_fattr *,
-- 
cgit v0.10.2


From 80a16b21a81eb639f0b726549f4c46c0e9aff92e Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Fri, 27 Apr 2012 13:27:46 -0400
Subject: NFS: Remove extra rpc_clnt argument to proc_lookup

Now that I'm doing secinfo automatically in the v4 code this extra
argument isn't needed.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 8789210..82b42e2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1143,7 +1143,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
 	if (fhandle == NULL || fattr == NULL)
 		goto out_error;
 
-	error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
+	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
 	if (error)
 		goto out_bad;
 	if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1299,7 +1299,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 	parent = dentry->d_parent;
 	/* Protect against concurrent sillydeletes */
 	nfs_block_sillyrename(parent);
-	error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
+	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
 	if (error == -ENOENT)
 		goto no_entry;
 	if (error < 0) {
@@ -1646,7 +1646,7 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
 	if (dentry->d_inode)
 		goto out;
 	if (fhandle->size == 0) {
-		error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
+		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
 		if (error)
 			goto out_error;
 	}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index e36fd8a..08b9c93 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -261,8 +261,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
 	struct dentry *parent = dget_parent(dentry);
 
 	/* Look it up again to get its attributes */
-	err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode,
-						  &dentry->d_name, fh, fattr);
+	err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr);
 	dput(parent);
 	if (err != 0)
 		return ERR_PTR(err);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c23214d..48bcad2 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -142,7 +142,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 }
 
 static int
-nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
+nfs3_proc_lookup(struct inode *dir, struct qstr *name,
 		 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	struct nfs3_diropargs	arg = {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2091af2..1780391 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2578,7 +2578,7 @@ out:
 	return err;
 }
 
-static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
+static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
 			    struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	int status;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 76b3229..fea9163 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -178,7 +178,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 }
 
 static int
-nfs_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
+nfs_proc_lookup(struct inode *dir, struct qstr *name,
 		struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	struct nfs_diropargs	arg = {
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 4dada94..c940d46 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1347,7 +1347,7 @@ struct nfs_rpc_ops {
 			    struct nfs_fattr *);
 	int	(*setattr) (struct dentry *, struct nfs_fattr *,
 			    struct iattr *);
-	int	(*lookup)  (struct rpc_clnt *clnt, struct inode *, struct qstr *,
+	int	(*lookup)  (struct inode *, struct qstr *,
 			    struct nfs_fh *, struct nfs_fattr *);
 	int	(*access)  (struct inode *, struct nfs_access_entry *);
 	int	(*readlink)(struct inode *, struct page *, unsigned int,
-- 
cgit v0.10.2


From 9b5415b536cc3193e9608a7fced1372df8ce4dcf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Apr 2012 14:31:47 -0400
Subject: NFS: Fix a use-before-initialised warning in fs/nfs/write.c and
 fs/nfs/pnfs.c

If the allocation of nfs_write_header fails, the list of nfs_pages that
needs to be cleaned up is still on desc->pg_list...

Reported-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4da05e4..39cbac5 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1308,7 +1308,7 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 
 	whdr = nfs_writehdr_alloc();
 	if (!whdr) {
-		desc->pg_completion_ops->error_cleanup(&hdr->pages);
+		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 		put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 		return -ENOMEM;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index fec214b..3636191 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1164,7 +1164,7 @@ static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 
 	whdr = nfs_writehdr_alloc();
 	if (!whdr) {
-		desc->pg_completion_ops->error_cleanup(&hdr->pages);
+		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 		return -ENOMEM;
 	}
 	hdr = &whdr->header;
-- 
cgit v0.10.2


From b58fee2189b17719c846f65ffe9483c2814e6605 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Mon, 30 Apr 2012 13:06:53 -0400
Subject: NFS: pnfs_pageio_init_read() and init_write() need an extra argument

This is only when CONFIG_NFS_V4_1 isn't enabled.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Acked-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 8efbee7..f20054b 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -424,12 +424,14 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
 {
 }
 
-static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
+static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode,
+					 const struct nfs_pgio_completion_ops *compl_ops)
 {
 	return false;
 }
 
-static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags)
+static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags,
+					  const struct nfs_pgio_completion_ops *compl_ops)
 {
 	return false;
 }
-- 
cgit v0.10.2


From 24fc9211f4d48c04882a52e42b21c9b4abc4f9bf Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Mon, 30 Apr 2012 13:27:11 -0400
Subject: NFS: Define nfs_direct_write_schedule_work() when v3 and v4 are
 disabled

v2 doesn't have commits, so this function can be a no-op.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index d44de2f..e83545c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -601,6 +601,9 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 }
 
 #else
+static void nfs_direct_write_schedule_work(struct work_struct *work)
+{
+}
 
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 {
-- 
cgit v0.10.2


From 68cd6fa4f3be07ba648e22617dfa16a40d671d19 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Mon, 30 Apr 2012 14:30:22 -0400
Subject: NFS: Define dummy nfs_init_cinfo() and nfs_init_cinfo_from_inode()

These are needed when v3 and v4 are not enabled.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3636191..2f80aa5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -567,6 +567,17 @@ int nfs_write_need_commit(struct nfs_write_data *data)
 }
 
 #else
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+				      struct inode *inode)
+{
+}
+
+void nfs_init_cinfo(struct nfs_commit_info *cinfo,
+		    struct inode *inode,
+		    struct nfs_direct_req *dreq)
+{
+}
+
 void
 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
 			struct nfs_commit_info *cinfo)
-- 
cgit v0.10.2


From 71e8cc00c63e8518ce86b4079355fc9086a4869d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 30 Apr 2012 13:22:54 -0400
Subject: NFS: Ensure that we break out of read/write_schedule_segment on error

Currently we do break out of the for() loop, but we also need to
break out of the enclosing do {} while()...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e83545c..f30d5c2 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -396,7 +396,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
 			pos += req_len;
 			count -= req_len;
 		}
-	} while (count != 0);
+	} while (count != 0 && result >= 0);
 
 	kfree(pagevec);
 
@@ -692,6 +692,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
 				nfs_release_request(req);
 				nfs_direct_release_pages(pagevec + i,
 							 npages - i);
+				break;
 			}
 			pgbase = 0;
 			bytes -= req_len;
@@ -700,7 +701,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
 			pos += req_len;
 			count -= req_len;
 		}
-	} while (count != 0);
+	} while (count != 0 && result >= 0);
 
 	kfree(pagevec);
 
-- 
cgit v0.10.2


From 3e9e0ca3f19e911ce13c2e6c9858fcb41a37496c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 30 Apr 2012 13:40:06 -0400
Subject: NFS: O_DIRECT pgio_completion_ops error_cleanup must unlock the
 request

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f30d5c2..af02bde 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -292,7 +292,7 @@ out_put:
 	hdr->release(hdr);
 }
 
-static void nfs_sync_pgio_error(struct list_head *head)
+static void nfs_read_sync_pgio_error(struct list_head *head)
 {
 	struct nfs_page *req;
 
@@ -309,7 +309,7 @@ static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
 }
 
 static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
-	.error_cleanup = nfs_sync_pgio_error,
+	.error_cleanup = nfs_read_sync_pgio_error,
 	.init_hdr = nfs_direct_pgio_init,
 	.completion = nfs_direct_read_completion,
 };
@@ -775,8 +775,20 @@ out_put:
 	hdr->release(hdr);
 }
 
+static void nfs_write_sync_pgio_error(struct list_head *head)
+{
+	struct nfs_page *req;
+
+	while (!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_release_request(req);
+		nfs_unlock_request(req);
+	}
+}
+
 static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
-	.error_cleanup = nfs_sync_pgio_error,
+	.error_cleanup = nfs_write_sync_pgio_error,
 	.init_hdr = nfs_direct_pgio_init,
 	.completion = nfs_direct_write_completion,
 };
-- 
cgit v0.10.2


From 6d74743b088d116e31fe1b73f47e782ee2016b94 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 30 Apr 2012 13:27:31 -0400
Subject: NFS: Simplify O_DIRECT page referencing

The O_DIRECT code shouldn't need to hold 2 references to each page. The
reference held by the struct nfs_page should suffice.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index af02bde..78d1ead 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -268,10 +268,9 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 			}
 			bytes += req->wb_bytes;
 			nfs_list_remove_request(req);
-			nfs_direct_readpage_release(req);
 			if (!PageCompound(page))
 				set_page_dirty(page);
-			page_cache_release(page);
+			nfs_direct_readpage_release(req);
 		}
 	} else {
 		while (!list_empty(&hdr->pages)) {
@@ -281,7 +280,6 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 				if (!PageCompound(req->wb_page))
 					set_page_dirty(req->wb_page);
 			bytes += req->wb_bytes;
-			page_cache_release(req->wb_page);
 			nfs_list_remove_request(req);
 			nfs_direct_readpage_release(req);
 		}
@@ -375,8 +373,6 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
 						 pagevec[i],
 						 pgbase, req_len);
 			if (IS_ERR(req)) {
-				nfs_direct_release_pages(pagevec + i,
-							 npages - i);
 				result = PTR_ERR(req);
 				break;
 			}
@@ -385,8 +381,6 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
 			if (!nfs_pageio_add_request(desc, req)) {
 				result = desc->pg_error;
 				nfs_release_request(req);
-				nfs_direct_release_pages(pagevec + i,
-							 npages - i);
 				break;
 			}
 			pgbase = 0;
@@ -396,6 +390,8 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
 			pos += req_len;
 			count -= req_len;
 		}
+		/* The nfs_page now hold references to these pages */
+		nfs_direct_release_pages(pagevec, npages);
 	} while (count != 0 && result >= 0);
 
 	kfree(pagevec);
@@ -509,7 +505,6 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 	nfs_pageio_complete(&desc);
 
 	while (!list_empty(&failed)) {
-		page_cache_release(req->wb_page);
 		nfs_release_request(req);
 		nfs_unlock_request(req);
 	}
@@ -542,10 +537,8 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
 		if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
 			/* Note the rewrite will go through mds */
 			nfs_mark_request_commit(req, NULL, &cinfo);
-		} else {
-			page_cache_release(req->wb_page);
+		} else
 			nfs_release_request(req);
-		}
 		nfs_unlock_request(req);
 	}
 
@@ -678,8 +671,6 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
 						 pagevec[i],
 						 pgbase, req_len);
 			if (IS_ERR(req)) {
-				nfs_direct_release_pages(pagevec + i,
-							 npages - i);
 				result = PTR_ERR(req);
 				break;
 			}
@@ -690,8 +681,6 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
 				result = desc->pg_error;
 				nfs_unlock_request(req);
 				nfs_release_request(req);
-				nfs_direct_release_pages(pagevec + i,
-							 npages - i);
 				break;
 			}
 			pgbase = 0;
@@ -701,6 +690,8 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
 			pos += req_len;
 			count -= req_len;
 		}
+		/* The nfs_page now hold references to these pages */
+		nfs_direct_release_pages(pagevec, npages);
 	} while (count != 0 && result >= 0);
 
 	kfree(pagevec);
@@ -763,7 +754,6 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
 			break;
 		default:
-			page_cache_release(req->wb_page);
 			nfs_release_request(req);
 		}
 		nfs_unlock_request(req);
-- 
cgit v0.10.2


From 292f3eeef00a20fa0ef4feec62792ad0065760a0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 30 Apr 2012 18:31:49 -0400
Subject: NFS: Use kmem_cache_zalloc() in nfs_direct_req_alloc

Simplify the initialisation of O_DIRECT requests.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 78d1ead..f17e469 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -149,26 +149,16 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 {
 	struct nfs_direct_req *dreq;
 
-	dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL);
+	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
 	if (!dreq)
 		return NULL;
 
 	kref_init(&dreq->kref);
 	kref_get(&dreq->kref);
 	init_completion(&dreq->completion);
-	dreq->mds_cinfo.ncommit = 0;
-	atomic_set(&dreq->mds_cinfo.rpcs_out, 0);
 	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
 	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
-	memset(&dreq->ds_cinfo, 0, sizeof(dreq->ds_cinfo));
-	dreq->iocb = NULL;
-	dreq->ctx = NULL;
-	dreq->l_ctx = NULL;
 	spin_lock_init(&dreq->lock);
-	atomic_set(&dreq->io_count, 0);
-	dreq->count = 0;
-	dreq->error = 0;
-	dreq->flags = 0;
 
 	return dreq;
 }
-- 
cgit v0.10.2


From 4f97615d19c370d1d907ef37f8bcd9c3672851ca Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 30 Apr 2012 18:39:20 -0400
Subject: NFS: Fix a compile issue when CONFIG_NFS_V4_1 is undefined

struct nfs_direct_req can't compile when struct pnfs_ds_commit_info
is undefined.

Reported-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index c940d46..6deb8f0 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1176,6 +1176,11 @@ struct nfs41_free_stateid_res {
 	struct nfs4_sequence_res	seq_res;
 };
 
+#else
+
+struct pnfs_ds_commit_info {
+};
+
 #endif /* CONFIG_NFS_V4_1 */
 
 struct nfs_page;
-- 
cgit v0.10.2


From f4bf7cf4cab90c98fe68a7afa12fb72790fd04bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Fri, 30 Mar 2012 22:04:56 +0200
Subject: mfd: Mark const init data with __initconst instead of __initdata for
 ab5500
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As long as there is no other non-const variable marked __initdata in the
same compilation unit it doesn't hurt. If there were one however
compilation would fail with

	error: $variablename causes a section type conflict

because a section containing const variables is marked read only and so
cannot contain non-const variables.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/ab5500-core.c b/drivers/mfd/ab5500-core.c
index 54d0fe4..3765c76 100644
--- a/drivers/mfd/ab5500-core.c
+++ b/drivers/mfd/ab5500-core.c
@@ -1291,7 +1291,7 @@ struct ab_family_id {
 	char	*name;
 };
 
-static const struct ab_family_id ids[] __initdata = {
+static const struct ab_family_id ids[] __initconst = {
 	/* AB5500 */
 	{
 		.id = AB5500_1_0,
-- 
cgit v0.10.2


From 4630b130b30be6420394ba31121e111c8771ca08 Mon Sep 17 00:00:00 2001
From: Aaron Sierra <asierra@xes-inc.com>
Date: Wed, 28 Mar 2012 09:43:10 -0500
Subject: mfd: Add LPC driver for Intel ICH chipsets

This driver currently creates resources for use by a forthcoming ICH
chipset GPIO driver. It could be expanded to create the resources for
converting the esb2rom (mtd) and iTCO_wdt (wdt), and potentially more,
drivers to use the mfd model.

Signed-off-by: Aaron Sierra <asierra@xes-inc.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 11e44386..c6edba6 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -748,6 +748,15 @@ config LPC_SCH
 	  LPC bridge function of the Intel SCH provides support for
 	  System Management Bus and General Purpose I/O.
 
+config LPC_ICH
+	tristate "Intel ICH LPC"
+	depends on PCI
+	select MFD_CORE
+	help
+	  The LPC bridge function of the Intel ICH provides support for
+	  many functional units. This driver provides needed support for
+	  other drivers to control these functions, currently GPIO.
+
 config MFD_RDC321X
 	tristate "Support for RDC-R321x southbridge"
 	select MFD_CORE
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 05fa538..c4500c3 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_MFD_DB5500_PRCMU)	+= db5500-prcmu.o
 obj-$(CONFIG_MFD_TIMBERDALE)    += timberdale.o
 obj-$(CONFIG_PMIC_ADP5520)	+= adp5520.o
 obj-$(CONFIG_LPC_SCH)		+= lpc_sch.o
+obj-$(CONFIG_LPC_ICH)		+= lpc_ich.o
 obj-$(CONFIG_MFD_RDC321X)	+= rdc321x-southbridge.o
 obj-$(CONFIG_MFD_JANZ_CMODIO)	+= janz-cmodio.o
 obj-$(CONFIG_MFD_JZ4740_ADC)	+= jz4740-adc.o
diff --git a/drivers/mfd/lpc_ich.c b/drivers/mfd/lpc_ich.c
new file mode 100644
index 0000000..7e3a7b6
--- /dev/null
+++ b/drivers/mfd/lpc_ich.c
@@ -0,0 +1,719 @@
+/*
+ *  lpc_ich.c - LPC interface for Intel ICH
+ *
+ *  LPC bridge function of the Intel ICH contains many other
+ *  functional units, such as Interrupt controllers, Timers,
+ *  Power Management, System Management, GPIO, RTC, and LPC
+ *  Configuration Registers.
+ *
+ *  This driver is derived from lpc_sch.
+
+ *  Copyright (c) 2011 Extreme Engineering Solution, Inc.
+ *  Author: Aaron Sierra <asierra@xes-inc.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License 2 as published
+ *  by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *  This driver supports the following I/O Controller hubs:
+ *	(See the intel documentation on http://developer.intel.com.)
+ *	document number 290655-003, 290677-014: 82801AA (ICH), 82801AB (ICHO)
+ *	document number 290687-002, 298242-027: 82801BA (ICH2)
+ *	document number 290733-003, 290739-013: 82801CA (ICH3-S)
+ *	document number 290716-001, 290718-007: 82801CAM (ICH3-M)
+ *	document number 290744-001, 290745-025: 82801DB (ICH4)
+ *	document number 252337-001, 252663-008: 82801DBM (ICH4-M)
+ *	document number 273599-001, 273645-002: 82801E (C-ICH)
+ *	document number 252516-001, 252517-028: 82801EB (ICH5), 82801ER (ICH5R)
+ *	document number 300641-004, 300884-013: 6300ESB
+ *	document number 301473-002, 301474-026: 82801F (ICH6)
+ *	document number 313082-001, 313075-006: 631xESB, 632xESB
+ *	document number 307013-003, 307014-024: 82801G (ICH7)
+ *	document number 322896-001, 322897-001: NM10
+ *	document number 313056-003, 313057-017: 82801H (ICH8)
+ *	document number 316972-004, 316973-012: 82801I (ICH9)
+ *	document number 319973-002, 319974-002: 82801J (ICH10)
+ *	document number 322169-001, 322170-003: 5 Series, 3400 Series (PCH)
+ *	document number 320066-003, 320257-008: EP80597 (IICH)
+ *	document number 324645-001, 324646-001: Cougar Point (CPT)
+ *	document number TBD : Patsburg (PBG)
+ *	document number TBD : DH89xxCC
+ *	document number TBD : Panther Point
+ *	document number TBD : Lynx Point
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/acpi.h>
+#include <linux/pci.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/lpc_ich.h>
+
+#define ACPIBASE		0x40
+#define ACPIBASE_GPE_OFF	0x28
+#define ACPIBASE_GPE_END	0x2f
+#define ACPICTRL		0x44
+
+#define GPIOBASE		0x48
+#define GPIOCTRL		0x4C
+
+static int lpc_ich_acpi_save = -1;
+static int lpc_ich_gpio_save = -1;
+
+static struct resource gpio_ich_res[] = {
+	/* GPIO */
+	{
+		.flags = IORESOURCE_IO,
+	},
+	/* ACPI - GPE0 */
+	{
+		.flags = IORESOURCE_IO,
+	},
+};
+
+enum lpc_cells {
+	LPC_GPIO = 0,
+};
+
+static struct mfd_cell lpc_ich_cells[] = {
+	[LPC_GPIO] = {
+		.name = "gpio_ich",
+		.num_resources = ARRAY_SIZE(gpio_ich_res),
+		.resources = gpio_ich_res,
+		.ignore_resource_conflicts = true,
+	},
+};
+
+/* chipset related info */
+enum lpc_chipsets {
+	LPC_ICH = 0,	/* ICH */
+	LPC_ICH0,	/* ICH0 */
+	LPC_ICH2,	/* ICH2 */
+	LPC_ICH2M,	/* ICH2-M */
+	LPC_ICH3,	/* ICH3-S */
+	LPC_ICH3M,	/* ICH3-M */
+	LPC_ICH4,	/* ICH4 */
+	LPC_ICH4M,	/* ICH4-M */
+	LPC_CICH,	/* C-ICH */
+	LPC_ICH5,	/* ICH5 & ICH5R */
+	LPC_6300ESB,	/* 6300ESB */
+	LPC_ICH6,	/* ICH6 & ICH6R */
+	LPC_ICH6M,	/* ICH6-M */
+	LPC_ICH6W,	/* ICH6W & ICH6RW */
+	LPC_631XESB,	/* 631xESB/632xESB */
+	LPC_ICH7,	/* ICH7 & ICH7R */
+	LPC_ICH7DH,	/* ICH7DH */
+	LPC_ICH7M,	/* ICH7-M & ICH7-U */
+	LPC_ICH7MDH,	/* ICH7-M DH */
+	LPC_NM10,	/* NM10 */
+	LPC_ICH8,	/* ICH8 & ICH8R */
+	LPC_ICH8DH,	/* ICH8DH */
+	LPC_ICH8DO,	/* ICH8DO */
+	LPC_ICH8M,	/* ICH8M */
+	LPC_ICH8ME,	/* ICH8M-E */
+	LPC_ICH9,	/* ICH9 */
+	LPC_ICH9R,	/* ICH9R */
+	LPC_ICH9DH,	/* ICH9DH */
+	LPC_ICH9DO,	/* ICH9DO */
+	LPC_ICH9M,	/* ICH9M */
+	LPC_ICH9ME,	/* ICH9M-E */
+	LPC_ICH10,	/* ICH10 */
+	LPC_ICH10R,	/* ICH10R */
+	LPC_ICH10D,	/* ICH10D */
+	LPC_ICH10DO,	/* ICH10DO */
+	LPC_PCH,	/* PCH Desktop Full Featured */
+	LPC_PCHM,	/* PCH Mobile Full Featured */
+	LPC_P55,	/* P55 */
+	LPC_PM55,	/* PM55 */
+	LPC_H55,	/* H55 */
+	LPC_QM57,	/* QM57 */
+	LPC_H57,	/* H57 */
+	LPC_HM55,	/* HM55 */
+	LPC_Q57,	/* Q57 */
+	LPC_HM57,	/* HM57 */
+	LPC_PCHMSFF,	/* PCH Mobile SFF Full Featured */
+	LPC_QS57,	/* QS57 */
+	LPC_3400,	/* 3400 */
+	LPC_3420,	/* 3420 */
+	LPC_3450,	/* 3450 */
+	LPC_EP80579,	/* EP80579 */
+	LPC_CPT,	/* Cougar Point */
+	LPC_CPTD,	/* Cougar Point Desktop */
+	LPC_CPTM,	/* Cougar Point Mobile */
+	LPC_PBG,	/* Patsburg */
+	LPC_DH89XXCC,	/* DH89xxCC */
+	LPC_PPT,	/* Panther Point */
+	LPC_LPT,	/* Lynx Point */
+};
+
+struct lpc_ich_info lpc_chipset_info[] __devinitdata = {
+	[LPC_ICH] = {
+		.name = "ICH",
+	},
+	[LPC_ICH0] = {
+		.name = "ICH0",
+	},
+	[LPC_ICH2] = {
+		.name = "ICH2",
+	},
+	[LPC_ICH2M] = {
+		.name = "ICH2-M",
+	},
+	[LPC_ICH3] = {
+		.name = "ICH3-S",
+	},
+	[LPC_ICH3M] = {
+		.name = "ICH3-M",
+	},
+	[LPC_ICH4] = {
+		.name = "ICH4",
+	},
+	[LPC_ICH4M] = {
+		.name = "ICH4-M",
+	},
+	[LPC_CICH] = {
+		.name = "C-ICH",
+	},
+	[LPC_ICH5] = {
+		.name = "ICH5 or ICH5R",
+	},
+	[LPC_6300ESB] = {
+		.name = "6300ESB",
+	},
+	[LPC_ICH6] = {
+		.name = "ICH6 or ICH6R",
+		.gpio_version = ICH_V6_GPIO,
+	},
+	[LPC_ICH6M] = {
+		.name = "ICH6-M",
+		.gpio_version = ICH_V6_GPIO,
+	},
+	[LPC_ICH6W] = {
+		.name = "ICH6W or ICH6RW",
+		.gpio_version = ICH_V6_GPIO,
+	},
+	[LPC_631XESB] = {
+		.name = "631xESB/632xESB",
+		.gpio_version = ICH_V6_GPIO,
+	},
+	[LPC_ICH7] = {
+		.name = "ICH7 or ICH7R",
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH7DH] = {
+		.name = "ICH7DH",
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH7M] = {
+		.name = "ICH7-M or ICH7-U",
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH7MDH] = {
+		.name = "ICH7-M DH",
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_NM10] = {
+		.name = "NM10",
+	},
+	[LPC_ICH8] = {
+		.name = "ICH8 or ICH8R",
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH8DH] = {
+		.name = "ICH8DH",
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH8DO] = {
+		.name = "ICH8DO",
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH8M] = {
+		.name = "ICH8M",
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH8ME] = {
+		.name = "ICH8M-E",
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH9] = {
+		.name = "ICH9",
+		.gpio_version = ICH_V9_GPIO,
+	},
+	[LPC_ICH9R] = {
+		.name = "ICH9R",
+		.gpio_version = ICH_V9_GPIO,
+	},
+	[LPC_ICH9DH] = {
+		.name = "ICH9DH",
+		.gpio_version = ICH_V9_GPIO,
+	},
+	[LPC_ICH9DO] = {
+		.name = "ICH9DO",
+		.gpio_version = ICH_V9_GPIO,
+	},
+	[LPC_ICH9M] = {
+		.name = "ICH9M",
+		.gpio_version = ICH_V9_GPIO,
+	},
+	[LPC_ICH9ME] = {
+		.name = "ICH9M-E",
+		.gpio_version = ICH_V9_GPIO,
+	},
+	[LPC_ICH10] = {
+		.name = "ICH10",
+		.gpio_version = ICH_V10CONS_GPIO,
+	},
+	[LPC_ICH10R] = {
+		.name = "ICH10R",
+		.gpio_version = ICH_V10CONS_GPIO,
+	},
+	[LPC_ICH10D] = {
+		.name = "ICH10D",
+		.gpio_version = ICH_V10CORP_GPIO,
+	},
+	[LPC_ICH10DO] = {
+		.name = "ICH10DO",
+		.gpio_version = ICH_V10CORP_GPIO,
+	},
+	[LPC_PCH] = {
+		.name = "PCH Desktop Full Featured",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_PCHM] = {
+		.name = "PCH Mobile Full Featured",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_P55] = {
+		.name = "P55",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_PM55] = {
+		.name = "PM55",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_H55] = {
+		.name = "H55",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_QM57] = {
+		.name = "QM57",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_H57] = {
+		.name = "H57",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_HM55] = {
+		.name = "HM55",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_Q57] = {
+		.name = "Q57",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_HM57] = {
+		.name = "HM57",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_PCHMSFF] = {
+		.name = "PCH Mobile SFF Full Featured",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_QS57] = {
+		.name = "QS57",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_3400] = {
+		.name = "3400",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_3420] = {
+		.name = "3420",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_3450] = {
+		.name = "3450",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_EP80579] = {
+		.name = "EP80579",
+	},
+	[LPC_CPT] = {
+		.name = "Cougar Point",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_CPTD] = {
+		.name = "Cougar Point Desktop",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_CPTM] = {
+		.name = "Cougar Point Mobile",
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_PBG] = {
+		.name = "Patsburg",
+	},
+	[LPC_DH89XXCC] = {
+		.name = "DH89xxCC",
+	},
+	[LPC_PPT] = {
+		.name = "Panther Point",
+	},
+	[LPC_LPT] = {
+		.name = "Lynx Point",
+	},
+};
+
+/*
+ * This data only exists for exporting the supported PCI ids
+ * via MODULE_DEVICE_TABLE.  We do not actually register a
+ * pci_driver, because the I/O Controller Hub has also other
+ * functions that probably will be registered by other drivers.
+ */
+static DEFINE_PCI_DEVICE_TABLE(lpc_ich_ids) = {
+	{ PCI_VDEVICE(INTEL, 0x2410), LPC_ICH},
+	{ PCI_VDEVICE(INTEL, 0x2420), LPC_ICH0},
+	{ PCI_VDEVICE(INTEL, 0x2440), LPC_ICH2},
+	{ PCI_VDEVICE(INTEL, 0x244c), LPC_ICH2M},
+	{ PCI_VDEVICE(INTEL, 0x2480), LPC_ICH3},
+	{ PCI_VDEVICE(INTEL, 0x248c), LPC_ICH3M},
+	{ PCI_VDEVICE(INTEL, 0x24c0), LPC_ICH4},
+	{ PCI_VDEVICE(INTEL, 0x24cc), LPC_ICH4M},
+	{ PCI_VDEVICE(INTEL, 0x2450), LPC_CICH},
+	{ PCI_VDEVICE(INTEL, 0x24d0), LPC_ICH5},
+	{ PCI_VDEVICE(INTEL, 0x25a1), LPC_6300ESB},
+	{ PCI_VDEVICE(INTEL, 0x2640), LPC_ICH6},
+	{ PCI_VDEVICE(INTEL, 0x2641), LPC_ICH6M},
+	{ PCI_VDEVICE(INTEL, 0x2642), LPC_ICH6W},
+	{ PCI_VDEVICE(INTEL, 0x2670), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2671), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2672), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2673), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2674), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2675), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2676), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2677), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2678), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2679), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x267a), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x267b), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x267c), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x267d), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x267e), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x267f), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x27b8), LPC_ICH7},
+	{ PCI_VDEVICE(INTEL, 0x27b0), LPC_ICH7DH},
+	{ PCI_VDEVICE(INTEL, 0x27b9), LPC_ICH7M},
+	{ PCI_VDEVICE(INTEL, 0x27bd), LPC_ICH7MDH},
+	{ PCI_VDEVICE(INTEL, 0x27bc), LPC_NM10},
+	{ PCI_VDEVICE(INTEL, 0x2810), LPC_ICH8},
+	{ PCI_VDEVICE(INTEL, 0x2812), LPC_ICH8DH},
+	{ PCI_VDEVICE(INTEL, 0x2814), LPC_ICH8DO},
+	{ PCI_VDEVICE(INTEL, 0x2815), LPC_ICH8M},
+	{ PCI_VDEVICE(INTEL, 0x2811), LPC_ICH8ME},
+	{ PCI_VDEVICE(INTEL, 0x2918), LPC_ICH9},
+	{ PCI_VDEVICE(INTEL, 0x2916), LPC_ICH9R},
+	{ PCI_VDEVICE(INTEL, 0x2912), LPC_ICH9DH},
+	{ PCI_VDEVICE(INTEL, 0x2914), LPC_ICH9DO},
+	{ PCI_VDEVICE(INTEL, 0x2919), LPC_ICH9M},
+	{ PCI_VDEVICE(INTEL, 0x2917), LPC_ICH9ME},
+	{ PCI_VDEVICE(INTEL, 0x3a18), LPC_ICH10},
+	{ PCI_VDEVICE(INTEL, 0x3a16), LPC_ICH10R},
+	{ PCI_VDEVICE(INTEL, 0x3a1a), LPC_ICH10D},
+	{ PCI_VDEVICE(INTEL, 0x3a14), LPC_ICH10DO},
+	{ PCI_VDEVICE(INTEL, 0x3b00), LPC_PCH},
+	{ PCI_VDEVICE(INTEL, 0x3b01), LPC_PCHM},
+	{ PCI_VDEVICE(INTEL, 0x3b02), LPC_P55},
+	{ PCI_VDEVICE(INTEL, 0x3b03), LPC_PM55},
+	{ PCI_VDEVICE(INTEL, 0x3b06), LPC_H55},
+	{ PCI_VDEVICE(INTEL, 0x3b07), LPC_QM57},
+	{ PCI_VDEVICE(INTEL, 0x3b08), LPC_H57},
+	{ PCI_VDEVICE(INTEL, 0x3b09), LPC_HM55},
+	{ PCI_VDEVICE(INTEL, 0x3b0a), LPC_Q57},
+	{ PCI_VDEVICE(INTEL, 0x3b0b), LPC_HM57},
+	{ PCI_VDEVICE(INTEL, 0x3b0d), LPC_PCHMSFF},
+	{ PCI_VDEVICE(INTEL, 0x3b0f), LPC_QS57},
+	{ PCI_VDEVICE(INTEL, 0x3b12), LPC_3400},
+	{ PCI_VDEVICE(INTEL, 0x3b14), LPC_3420},
+	{ PCI_VDEVICE(INTEL, 0x3b16), LPC_3450},
+	{ PCI_VDEVICE(INTEL, 0x5031), LPC_EP80579},
+	{ PCI_VDEVICE(INTEL, 0x1c41), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c42), LPC_CPTD},
+	{ PCI_VDEVICE(INTEL, 0x1c43), LPC_CPTM},
+	{ PCI_VDEVICE(INTEL, 0x1c44), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c45), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c46), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c47), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c48), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c49), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c4a), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c4b), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c4c), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c4d), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c4e), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c4f), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c50), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c51), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c52), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c53), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c54), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c55), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c56), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c57), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c58), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c59), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c5a), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c5b), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c5c), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c5d), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c5e), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c5f), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1d40), LPC_PBG},
+	{ PCI_VDEVICE(INTEL, 0x1d41), LPC_PBG},
+	{ PCI_VDEVICE(INTEL, 0x2310), LPC_DH89XXCC},
+	{ PCI_VDEVICE(INTEL, 0x1e40), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e41), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e42), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e43), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e44), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e45), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e46), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e47), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e48), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e49), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e4a), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e4b), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e4c), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e4d), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e4e), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e4f), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e50), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e51), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e52), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e53), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e54), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e55), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e56), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e57), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e58), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e59), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e5a), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e5b), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e5c), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e5d), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e5e), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e5f), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x8c40), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c41), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c42), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c43), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c44), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c45), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c46), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c47), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c48), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c49), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c4a), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c4b), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c4c), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c4d), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c4e), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c4f), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c50), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c51), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c52), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c53), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c54), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c55), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c56), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c57), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c58), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c59), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c5a), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c5b), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c5c), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c5d), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c5e), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c5f), LPC_LPT},
+	{ 0, },			/* End of list */
+};
+MODULE_DEVICE_TABLE(pci, lpc_ich_ids);
+
+static void lpc_ich_restore_config_space(struct pci_dev *dev)
+{
+	if (lpc_ich_acpi_save >= 0) {
+		pci_write_config_byte(dev, ACPICTRL, lpc_ich_acpi_save);
+		lpc_ich_acpi_save = -1;
+	}
+
+	if (lpc_ich_gpio_save >= 0) {
+		pci_write_config_byte(dev, GPIOCTRL, lpc_ich_gpio_save);
+		lpc_ich_gpio_save = -1;
+	}
+}
+
+static void __devinit lpc_ich_enable_acpi_space(struct pci_dev *dev)
+{
+	u8 reg_save;
+
+	pci_read_config_byte(dev, ACPICTRL, &reg_save);
+	pci_write_config_byte(dev, ACPICTRL, reg_save | 0x10);
+	lpc_ich_acpi_save = reg_save;
+}
+
+static void __devinit lpc_ich_enable_gpio_space(struct pci_dev *dev)
+{
+	u8 reg_save;
+
+	pci_read_config_byte(dev, GPIOCTRL, &reg_save);
+	pci_write_config_byte(dev, GPIOCTRL, reg_save | 0x10);
+	lpc_ich_gpio_save = reg_save;
+}
+
+static void __devinit lpc_ich_finalize_cell(struct mfd_cell *cell,
+					const struct pci_device_id *id)
+{
+	cell->platform_data = &lpc_chipset_info[id->driver_data];
+	cell->pdata_size = sizeof(struct lpc_ich_info);
+}
+
+static int __devinit lpc_ich_init_gpio(struct pci_dev *dev,
+				const struct pci_device_id *id)
+{
+	u32 base_addr_cfg;
+	u32 base_addr;
+	int ret;
+	bool acpi_conflict = false;
+	struct resource *res;
+
+	/* Setup power management base register */
+	pci_read_config_dword(dev, ACPIBASE, &base_addr_cfg);
+	base_addr = base_addr_cfg & 0x0000ff80;
+	if (!base_addr) {
+		dev_err(&dev->dev, "I/O space for ACPI uninitialized\n");
+		lpc_ich_cells[LPC_GPIO].num_resources--;
+		goto gpe0_done;
+	}
+
+	res = &gpio_ich_res[ICH_RES_GPE0];
+	res->start = base_addr + ACPIBASE_GPE_OFF;
+	res->end = base_addr + ACPIBASE_GPE_END;
+	ret = acpi_check_resource_conflict(res);
+	if (ret) {
+		/*
+		 * This isn't fatal for the GPIO, but we have to make sure that
+		 * the platform_device subsystem doesn't see this resource
+		 * or it will register an invalid region.
+		 */
+		lpc_ich_cells[LPC_GPIO].num_resources--;
+		acpi_conflict = true;
+	} else {
+		lpc_ich_enable_acpi_space(dev);
+	}
+
+gpe0_done:
+	/* Setup GPIO base register */
+	pci_read_config_dword(dev, GPIOBASE, &base_addr_cfg);
+	base_addr = base_addr_cfg & 0x0000ff80;
+	if (!base_addr) {
+		dev_err(&dev->dev, "I/O space for GPIO uninitialized\n");
+		ret = -ENODEV;
+		goto gpio_done;
+	}
+
+	/* Older devices provide fewer GPIO and have a smaller resource size. */
+	res = &gpio_ich_res[ICH_RES_GPIO];
+	res->start = base_addr;
+	switch (lpc_chipset_info[id->driver_data].gpio_version) {
+	case ICH_V5_GPIO:
+	case ICH_V10CORP_GPIO:
+		res->end = res->start + 128 - 1;
+		break;
+	default:
+		res->end = res->start + 64 - 1;
+		break;
+	}
+
+	ret = acpi_check_resource_conflict(res);
+	if (ret) {
+		/* this isn't necessarily fatal for the GPIO */
+		acpi_conflict = true;
+		goto gpio_done;
+	}
+	lpc_ich_enable_gpio_space(dev);
+
+	lpc_ich_finalize_cell(&lpc_ich_cells[LPC_GPIO], id);
+	ret = mfd_add_devices(&dev->dev, -1, &lpc_ich_cells[LPC_GPIO],
+				1, NULL, 0);
+
+gpio_done:
+	if (acpi_conflict)
+		pr_warn("Resource conflict(s) found affecting %s\n",
+				lpc_ich_cells[LPC_GPIO].name);
+	return ret;
+}
+
+static int __devinit lpc_ich_probe(struct pci_dev *dev,
+				const struct pci_device_id *id)
+{
+	int ret;
+	bool cell_added = false;
+
+	ret = lpc_ich_init_gpio(dev, id);
+	if (!ret)
+		cell_added = true;
+
+	/*
+	 * We only care if at least one or none of the cells registered
+	 * successfully.
+	 */
+	if (!cell_added) {
+		lpc_ich_restore_config_space(dev);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static void __devexit lpc_ich_remove(struct pci_dev *dev)
+{
+	mfd_remove_devices(&dev->dev);
+	lpc_ich_restore_config_space(dev);
+}
+
+static struct pci_driver lpc_ich_driver = {
+	.name		= "lpc_ich",
+	.id_table	= lpc_ich_ids,
+	.probe		= lpc_ich_probe,
+	.remove		= __devexit_p(lpc_ich_remove),
+};
+
+static int __init lpc_ich_init(void)
+{
+	return pci_register_driver(&lpc_ich_driver);
+}
+
+static void __exit lpc_ich_exit(void)
+{
+	pci_unregister_driver(&lpc_ich_driver);
+}
+
+module_init(lpc_ich_init);
+module_exit(lpc_ich_exit);
+
+MODULE_AUTHOR("Aaron Sierra <asierra@xes-inc.com>");
+MODULE_DESCRIPTION("LPC interface for Intel ICH");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/lpc_ich.h b/include/linux/mfd/lpc_ich.h
new file mode 100644
index 0000000..91300b1
--- /dev/null
+++ b/include/linux/mfd/lpc_ich.h
@@ -0,0 +1,41 @@
+/*
+ *  linux/drivers/mfd/lpc_ich.h
+ *
+ *  Copyright (c) 2012 Extreme Engineering Solution, Inc.
+ *  Author: Aaron Sierra <asierra@xes-inc.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License 2 as published
+ *  by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef LPC_ICH_H
+#define LPC_ICH_H
+
+/* GPIO resources */
+#define ICH_RES_GPIO	0
+#define ICH_RES_GPE0	1
+
+/* GPIO compatibility */
+#define ICH_I3100_GPIO		0x401
+#define ICH_V5_GPIO		0x501
+#define ICH_V6_GPIO		0x601
+#define ICH_V7_GPIO		0x701
+#define ICH_V9_GPIO		0x801
+#define ICH_V10CORP_GPIO	0xa01
+#define ICH_V10CONS_GPIO	0xa11
+
+struct lpc_ich_info {
+	char name[32];
+	unsigned int gpio_version;
+};
+
+#endif
-- 
cgit v0.10.2


From 6ed9f9c405f97cb7cda485f589cfa6c2bb3fb78e Mon Sep 17 00:00:00 2001
From: Peter Tyser <ptyser@xes-inc.com>
Date: Wed, 18 Apr 2012 09:48:24 -0500
Subject: gpio: Add support for Intel ICHx/3100/Series[56] GPIO

This driver works on many Intel chipsets, including the ICH6, ICH7,
ICH8, ICH9, ICH10, 3100, Series 5/3400 (Ibex Peak), Series 6/C200
(Cougar Point), and NM10 (Tiger Point).

Additional Intel chipsets should be easily supported if needed, eg the
ICH1-5, EP80579, etc.

Tested on QM67 (Cougar Point), QM57 (Ibex Peak), 3100 (Whitmore Lake),
and NM10 (Tiger Point).

Includes work from Jean Delvare:
        - Resource leak removal during module load/unload
        - GPIO API bit value enforcement

Also includes code cleanup from Guenter Roeck and Grant Likely.

Signed-off-by: Peter Tyser <ptyser@xes-inc.com>
Signed-off-by: Aaron Sierra <asierra@xes-inc.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/MAINTAINERS b/MAINTAINERS
index bb76fc4..28ebbb5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3340,6 +3340,12 @@ W:	http://www.developer.ibm.com/welcome/netfinity/serveraid.html
 S:	Supported
 F:	drivers/scsi/ips.*
 
+ICH LPC AND GPIO DRIVER
+M:	Peter Tyser <ptyser@xes-inc.com>
+S:	Maintained
+F:	drivers/mfd/lpc_ich.c
+F:	drivers/gpio/gpio-ich.c
+
 IDE SUBSYSTEM
 M:	"David S. Miller" <davem@davemloft.net>
 L:	linux-ide@vger.kernel.org
diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index e03653d..a0d5796 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -178,6 +178,19 @@ config GPIO_SCH
 	  The Intel Tunnel Creek processor has 5 GPIOs powered by the
 	  core power rail and 9 from suspend power supply.
 
+config GPIO_ICH
+	tristate "Intel ICH GPIO"
+	depends on PCI && X86
+	select MFD_CORE
+	select LPC_ICH
+	help
+	  Say yes here to support the GPIO functionality of a number of Intel
+	  ICH-based chipsets.  Currently supported devices: ICH6, ICH7, ICH8
+	  ICH9, ICH10, Series 5/3400 (eg Ibex Peak), Series 6/C200 (eg
+	  Cougar Point), NM10 (Tiger Point), and 3100 (Whitmore Lake).
+
+	  If unsure, say N.
+
 config GPIO_VX855
 	tristate "VIA VX855/VX875 GPIO"
 	depends on PCI
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index 007f54b..183c42b 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_GPIO_DA9052)	+= gpio-da9052.o
 obj-$(CONFIG_ARCH_DAVINCI)	+= gpio-davinci.o
 obj-$(CONFIG_GPIO_EP93XX)	+= gpio-ep93xx.o
 obj-$(CONFIG_GPIO_GE_FPGA)	+= gpio-ge.o
+obj-$(CONFIG_GPIO_ICH)		+= gpio-ich.o
 obj-$(CONFIG_GPIO_IT8761E)	+= gpio-it8761e.o
 obj-$(CONFIG_GPIO_JANZ_TTL)	+= gpio-janz-ttl.o
 obj-$(CONFIG_ARCH_KS8695)	+= gpio-ks8695.o
diff --git a/drivers/gpio/gpio-ich.c b/drivers/gpio/gpio-ich.c
new file mode 100644
index 0000000..b7c0651
--- /dev/null
+++ b/drivers/gpio/gpio-ich.c
@@ -0,0 +1,419 @@
+/*
+ * Intel ICH6-10, Series 5 and 6 GPIO driver
+ *
+ * Copyright (C) 2010 Extreme Engineering Solutions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/gpio.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/lpc_ich.h>
+
+#define DRV_NAME "gpio_ich"
+
+/*
+ * GPIO register offsets in GPIO I/O space.
+ * Each chunk of 32 GPIOs is manipulated via its own USE_SELx, IO_SELx, and
+ * LVLx registers.  Logic in the read/write functions takes a register and
+ * an absolute bit number and determines the proper register offset and bit
+ * number in that register.  For example, to read the value of GPIO bit 50
+ * the code would access offset ichx_regs[2(=GPIO_LVL)][1(=50/32)],
+ * bit 18 (50%32).
+ */
+enum GPIO_REG {
+	GPIO_USE_SEL = 0,
+	GPIO_IO_SEL,
+	GPIO_LVL,
+};
+
+static const u8 ichx_regs[3][3] = {
+	{0x00, 0x30, 0x40},	/* USE_SEL[1-3] offsets */
+	{0x04, 0x34, 0x44},	/* IO_SEL[1-3] offsets */
+	{0x0c, 0x38, 0x48},	/* LVL[1-3] offsets */
+};
+
+#define ICHX_WRITE(val, reg, base_res)	outl(val, (reg) + (base_res)->start)
+#define ICHX_READ(reg, base_res)	inl((reg) + (base_res)->start)
+
+struct ichx_desc {
+	/* Max GPIO pins the chipset can have */
+	uint ngpio;
+
+	/* Whether the chipset has GPIO in GPE0_STS in the PM IO region */
+	bool uses_gpe0;
+
+	/* USE_SEL is bogus on some chipsets, eg 3100 */
+	u32 use_sel_ignore[3];
+
+	/* Some chipsets have quirks, let these use their own request/get */
+	int (*request)(struct gpio_chip *chip, unsigned offset);
+	int (*get)(struct gpio_chip *chip, unsigned offset);
+};
+
+static struct {
+	spinlock_t lock;
+	struct platform_device *dev;
+	struct gpio_chip chip;
+	struct resource *gpio_base;	/* GPIO IO base */
+	struct resource *pm_base;	/* Power Mangagment IO base */
+	struct ichx_desc *desc;	/* Pointer to chipset-specific description */
+	u32 orig_gpio_ctrl;	/* Orig CTRL value, used to restore on exit */
+} ichx_priv;
+
+static int modparam_gpiobase = -1;	/* dynamic */
+module_param_named(gpiobase, modparam_gpiobase, int, 0444);
+MODULE_PARM_DESC(gpiobase, "The GPIO number base. -1 means dynamic, "
+			   "which is the default.");
+
+static int ichx_write_bit(int reg, unsigned nr, int val, int verify)
+{
+	unsigned long flags;
+	u32 data, tmp;
+	int reg_nr = nr / 32;
+	int bit = nr & 0x1f;
+	int ret = 0;
+
+	spin_lock_irqsave(&ichx_priv.lock, flags);
+
+	data = ICHX_READ(ichx_regs[reg][reg_nr], ichx_priv.gpio_base);
+	if (val)
+		data |= 1 << bit;
+	else
+		data &= ~(1 << bit);
+	ICHX_WRITE(data, ichx_regs[reg][reg_nr], ichx_priv.gpio_base);
+	tmp = ICHX_READ(ichx_regs[reg][reg_nr], ichx_priv.gpio_base);
+	if (verify && data != tmp)
+		ret = -EPERM;
+
+	spin_unlock_irqrestore(&ichx_priv.lock, flags);
+
+	return ret;
+}
+
+static int ichx_read_bit(int reg, unsigned nr)
+{
+	unsigned long flags;
+	u32 data;
+	int reg_nr = nr / 32;
+	int bit = nr & 0x1f;
+
+	spin_lock_irqsave(&ichx_priv.lock, flags);
+
+	data = ICHX_READ(ichx_regs[reg][reg_nr], ichx_priv.gpio_base);
+
+	spin_unlock_irqrestore(&ichx_priv.lock, flags);
+
+	return data & (1 << bit) ? 1 : 0;
+}
+
+static int ichx_gpio_direction_input(struct gpio_chip *gpio, unsigned nr)
+{
+	/*
+	 * Try setting pin as an input and verify it worked since many pins
+	 * are output-only.
+	 */
+	if (ichx_write_bit(GPIO_IO_SEL, nr, 1, 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ichx_gpio_direction_output(struct gpio_chip *gpio, unsigned nr,
+					int val)
+{
+	/* Set GPIO output value. */
+	ichx_write_bit(GPIO_LVL, nr, val, 0);
+
+	/*
+	 * Try setting pin as an output and verify it worked since many pins
+	 * are input-only.
+	 */
+	if (ichx_write_bit(GPIO_IO_SEL, nr, 0, 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ichx_gpio_get(struct gpio_chip *chip, unsigned nr)
+{
+	return ichx_read_bit(GPIO_LVL, nr);
+}
+
+static int ich6_gpio_get(struct gpio_chip *chip, unsigned nr)
+{
+	unsigned long flags;
+	u32 data;
+
+	/*
+	 * GPI 0 - 15 need to be read from the power management registers on
+	 * a ICH6/3100 bridge.
+	 */
+	if (nr < 16) {
+		if (!ichx_priv.pm_base)
+			return -ENXIO;
+
+		spin_lock_irqsave(&ichx_priv.lock, flags);
+
+		/* GPI 0 - 15 are latched, write 1 to clear*/
+		ICHX_WRITE(1 << (16 + nr), 0, ichx_priv.pm_base);
+		data = ICHX_READ(0, ichx_priv.pm_base);
+
+		spin_unlock_irqrestore(&ichx_priv.lock, flags);
+
+		return (data >> 16) & (1 << nr) ? 1 : 0;
+	} else {
+		return ichx_gpio_get(chip, nr);
+	}
+}
+
+static int ichx_gpio_request(struct gpio_chip *chip, unsigned nr)
+{
+	/*
+	 * Note we assume the BIOS properly set a bridge's USE value.  Some
+	 * chips (eg Intel 3100) have bogus USE values though, so first see if
+	 * the chipset's USE value can be trusted for this specific bit.
+	 * If it can't be trusted, assume that the pin can be used as a GPIO.
+	 */
+	if (ichx_priv.desc->use_sel_ignore[nr / 32] & (1 << (nr & 0x1f)))
+		return 1;
+
+	return ichx_read_bit(GPIO_USE_SEL, nr) ? 0 : -ENODEV;
+}
+
+static int ich6_gpio_request(struct gpio_chip *chip, unsigned nr)
+{
+	/*
+	 * Fixups for bits 16 and 17 are necessary on the Intel ICH6/3100
+	 * bridge as they are controlled by USE register bits 0 and 1.  See
+	 * "Table 704 GPIO_USE_SEL1 register" in the i3100 datasheet for
+	 * additional info.
+	 */
+	if (nr == 16 || nr == 17)
+		nr -= 16;
+
+	return ichx_gpio_request(chip, nr);
+}
+
+static void ichx_gpio_set(struct gpio_chip *chip, unsigned nr, int val)
+{
+	ichx_write_bit(GPIO_LVL, nr, val, 0);
+}
+
+static void __devinit ichx_gpiolib_setup(struct gpio_chip *chip)
+{
+	chip->owner = THIS_MODULE;
+	chip->label = DRV_NAME;
+	chip->dev = &ichx_priv.dev->dev;
+
+	/* Allow chip-specific overrides of request()/get() */
+	chip->request = ichx_priv.desc->request ?
+		ichx_priv.desc->request : ichx_gpio_request;
+	chip->get = ichx_priv.desc->get ?
+		ichx_priv.desc->get : ichx_gpio_get;
+
+	chip->set = ichx_gpio_set;
+	chip->direction_input = ichx_gpio_direction_input;
+	chip->direction_output = ichx_gpio_direction_output;
+	chip->base = modparam_gpiobase;
+	chip->ngpio = ichx_priv.desc->ngpio;
+	chip->can_sleep = 0;
+	chip->dbg_show = NULL;
+}
+
+/* ICH6-based, 631xesb-based */
+static struct ichx_desc ich6_desc = {
+	/* Bridges using the ICH6 controller need fixups for GPIO 0 - 17 */
+	.request = ich6_gpio_request,
+	.get = ich6_gpio_get,
+
+	/* GPIO 0-15 are read in the GPE0_STS PM register */
+	.uses_gpe0 = true,
+
+	.ngpio = 50,
+};
+
+/* Intel 3100 */
+static struct ichx_desc i3100_desc = {
+	/*
+	 * Bits 16,17, 20 of USE_SEL and bit 16 of USE_SEL2 always read 0 on
+	 * the Intel 3100.  See "Table 712. GPIO Summary Table" of 3100
+	 * Datasheet for more info.
+	 */
+	.use_sel_ignore = {0x00130000, 0x00010000, 0x0},
+
+	/* The 3100 needs fixups for GPIO 0 - 17 */
+	.request = ich6_gpio_request,
+	.get = ich6_gpio_get,
+
+	/* GPIO 0-15 are read in the GPE0_STS PM register */
+	.uses_gpe0 = true,
+
+	.ngpio = 50,
+};
+
+/* ICH7 and ICH8-based */
+static struct ichx_desc ich7_desc = {
+	.ngpio = 50,
+};
+
+/* ICH9-based */
+static struct ichx_desc ich9_desc = {
+	.ngpio = 61,
+};
+
+/* ICH10-based - Consumer/corporate versions have different amount of GPIO */
+static struct ichx_desc ich10_cons_desc = {
+	.ngpio = 61,
+};
+static struct ichx_desc ich10_corp_desc = {
+	.ngpio = 72,
+};
+
+/* Intel 5 series, 6 series, 3400 series, and C200 series */
+static struct ichx_desc intel5_desc = {
+	.ngpio = 76,
+};
+
+static int __devinit ichx_gpio_probe(struct platform_device *pdev)
+{
+	struct resource *res_base, *res_pm;
+	int err;
+	struct lpc_ich_info *ich_info = pdev->dev.platform_data;
+
+	if (!ich_info)
+		return -ENODEV;
+
+	ichx_priv.dev = pdev;
+
+	switch (ich_info->gpio_version) {
+	case ICH_I3100_GPIO:
+		ichx_priv.desc = &i3100_desc;
+		break;
+	case ICH_V5_GPIO:
+		ichx_priv.desc = &intel5_desc;
+		break;
+	case ICH_V6_GPIO:
+		ichx_priv.desc = &ich6_desc;
+		break;
+	case ICH_V7_GPIO:
+		ichx_priv.desc = &ich7_desc;
+		break;
+	case ICH_V9_GPIO:
+		ichx_priv.desc = &ich9_desc;
+		break;
+	case ICH_V10CORP_GPIO:
+		ichx_priv.desc = &ich10_corp_desc;
+		break;
+	case ICH_V10CONS_GPIO:
+		ichx_priv.desc = &ich10_cons_desc;
+		break;
+	default:
+		return -ENODEV;
+	}
+
+	res_base = platform_get_resource(pdev, IORESOURCE_IO, ICH_RES_GPIO);
+	if (!res_base || !res_base->start || !res_base->end)
+		return -ENODEV;
+
+	if (!request_region(res_base->start, resource_size(res_base),
+				pdev->name))
+		return -EBUSY;
+
+	ichx_priv.gpio_base = res_base;
+
+	/*
+	 * If necessary, determine the I/O address of ACPI/power management
+	 * registers which are needed to read the the GPE0 register for GPI pins
+	 * 0 - 15 on some chipsets.
+	 */
+	if (!ichx_priv.desc->uses_gpe0)
+		goto init;
+
+	res_pm = platform_get_resource(pdev, IORESOURCE_IO, ICH_RES_GPE0);
+	if (!res_pm) {
+		pr_warn("ACPI BAR is unavailable, GPI 0 - 15 unavailable\n");
+		goto init;
+	}
+
+	if (!request_region(res_pm->start, resource_size(res_pm),
+			pdev->name)) {
+		pr_warn("ACPI BAR is busy, GPI 0 - 15 unavailable\n");
+		goto init;
+	}
+
+	ichx_priv.pm_base = res_pm;
+
+init:
+	ichx_gpiolib_setup(&ichx_priv.chip);
+	err = gpiochip_add(&ichx_priv.chip);
+	if (err) {
+		pr_err("Failed to register GPIOs\n");
+		goto add_err;
+	}
+
+	pr_info("GPIO from %d to %d on %s\n", ichx_priv.chip.base,
+	       ichx_priv.chip.base + ichx_priv.chip.ngpio - 1, DRV_NAME);
+
+	return 0;
+
+add_err:
+	release_region(ichx_priv.gpio_base->start,
+			resource_size(ichx_priv.gpio_base));
+	if (ichx_priv.pm_base)
+		release_region(ichx_priv.pm_base->start,
+				resource_size(ichx_priv.pm_base));
+	return err;
+}
+
+static int __devexit ichx_gpio_remove(struct platform_device *pdev)
+{
+	int err;
+
+	err = gpiochip_remove(&ichx_priv.chip);
+	if (err) {
+		dev_err(&pdev->dev, "%s failed, %d\n",
+				"gpiochip_remove()", err);
+		return err;
+	}
+
+	release_region(ichx_priv.gpio_base->start,
+				resource_size(ichx_priv.gpio_base));
+	if (ichx_priv.pm_base)
+		release_region(ichx_priv.pm_base->start,
+				resource_size(ichx_priv.pm_base));
+
+	return 0;
+}
+
+static struct platform_driver ichx_gpio_driver = {
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= DRV_NAME,
+	},
+	.probe		= ichx_gpio_probe,
+	.remove		= __devexit_p(ichx_gpio_remove),
+};
+
+module_platform_driver(ichx_gpio_driver);
+
+MODULE_AUTHOR("Peter Tyser <ptyser@xes-inc.com>");
+MODULE_DESCRIPTION("GPIO interface for Intel ICH series");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:"DRV_NAME);
-- 
cgit v0.10.2


From 38a36f5a6ab893fac87ffd1b1c92a491dfd71ea1 Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@gmail.com>
Date: Tue, 3 Apr 2012 09:09:19 +0800
Subject: mfd: Use module_pci_driver

This patch converts the drivers in drivers/mfd/* to use module_pci_driver()
macro which makes the code smaller and a bit simpler.

Signed-off-by: Axel Lin <axel.lin@gmail.com>
Cc: Andres Salomon <dilinger@queued.net>
Cc: Ira W. Snyder <iws@ovro.caltech.edu>
Cc: Florian Fainelli <florian@openwrt.org>
Cc: Denis Turischev <denis@compulab.co.il>
Cc: Harald Welte <HaraldWelte@viatech.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/cs5535-mfd.c b/drivers/mfd/cs5535-mfd.c
index 315fef5..3419e72 100644
--- a/drivers/mfd/cs5535-mfd.c
+++ b/drivers/mfd/cs5535-mfd.c
@@ -186,18 +186,7 @@ static struct pci_driver cs5535_mfd_driver = {
 	.remove = __devexit_p(cs5535_mfd_remove),
 };
 
-static int __init cs5535_mfd_init(void)
-{
-	return pci_register_driver(&cs5535_mfd_driver);
-}
-
-static void __exit cs5535_mfd_exit(void)
-{
-	pci_unregister_driver(&cs5535_mfd_driver);
-}
-
-module_init(cs5535_mfd_init);
-module_exit(cs5535_mfd_exit);
+module_pci_driver(cs5535_mfd_driver);
 
 MODULE_AUTHOR("Andres Salomon <dilinger@queued.net>");
 MODULE_DESCRIPTION("MFD driver for CS5535/CS5536 southbridge's ISA PCI device");
diff --git a/drivers/mfd/janz-cmodio.c b/drivers/mfd/janz-cmodio.c
index a9223ed..2ea9998 100644
--- a/drivers/mfd/janz-cmodio.c
+++ b/drivers/mfd/janz-cmodio.c
@@ -283,23 +283,8 @@ static struct pci_driver cmodio_pci_driver = {
 	.remove   = __devexit_p(cmodio_pci_remove),
 };
 
-/*
- * Module Init / Exit
- */
-
-static int __init cmodio_init(void)
-{
-	return pci_register_driver(&cmodio_pci_driver);
-}
-
-static void __exit cmodio_exit(void)
-{
-	pci_unregister_driver(&cmodio_pci_driver);
-}
+module_pci_driver(cmodio_pci_driver);
 
 MODULE_AUTHOR("Ira W. Snyder <iws@ovro.caltech.edu>");
 MODULE_DESCRIPTION("Janz CMOD-IO PCI MODULbus Carrier Board Driver");
 MODULE_LICENSE("GPL");
-
-module_init(cmodio_init);
-module_exit(cmodio_exit);
diff --git a/drivers/mfd/lpc_sch.c b/drivers/mfd/lpc_sch.c
index abc4213..27026eb 100644
--- a/drivers/mfd/lpc_sch.c
+++ b/drivers/mfd/lpc_sch.c
@@ -167,18 +167,7 @@ static struct pci_driver lpc_sch_driver = {
 	.remove		= __devexit_p(lpc_sch_remove),
 };
 
-static int __init lpc_sch_init(void)
-{
-	return pci_register_driver(&lpc_sch_driver);
-}
-
-static void __exit lpc_sch_exit(void)
-{
-	pci_unregister_driver(&lpc_sch_driver);
-}
-
-module_init(lpc_sch_init);
-module_exit(lpc_sch_exit);
+module_pci_driver(lpc_sch_driver);
 
 MODULE_AUTHOR("Denis Turischev <denis@compulab.co.il>");
 MODULE_DESCRIPTION("LPC interface for Intel Poulsbo SCH");
diff --git a/drivers/mfd/rdc321x-southbridge.c b/drivers/mfd/rdc321x-southbridge.c
index 809bd4a..685d61e 100644
--- a/drivers/mfd/rdc321x-southbridge.c
+++ b/drivers/mfd/rdc321x-southbridge.c
@@ -108,18 +108,7 @@ static struct pci_driver rdc321x_sb_driver = {
 	.remove		= __devexit_p(rdc321x_sb_remove),
 };
 
-static int __init rdc321x_sb_init(void)
-{
-	return pci_register_driver(&rdc321x_sb_driver);
-}
-
-static void __exit rdc321x_sb_exit(void)
-{
-	pci_unregister_driver(&rdc321x_sb_driver);
-}
-
-module_init(rdc321x_sb_init);
-module_exit(rdc321x_sb_exit);
+module_pci_driver(rdc321x_sb_driver);
 
 MODULE_AUTHOR("Florian Fainelli <florian@openwrt.org>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/vx855.c b/drivers/mfd/vx855.c
index b73cc15..872aff2 100644
--- a/drivers/mfd/vx855.c
+++ b/drivers/mfd/vx855.c
@@ -131,17 +131,7 @@ static struct pci_driver vx855_pci_driver = {
 	.remove		= __devexit_p(vx855_remove),
 };
 
-static int vx855_init(void)
-{
-	return pci_register_driver(&vx855_pci_driver);
-}
-module_init(vx855_init);
-
-static void vx855_exit(void)
-{
-	pci_unregister_driver(&vx855_pci_driver);
-}
-module_exit(vx855_exit);
+module_pci_driver(vx855_pci_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <HaraldWelte@viatech.com>");
-- 
cgit v0.10.2


From 18996db262d0f1e79357315e033ace24488f92ad Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Thu, 5 Apr 2012 15:16:56 +0100
Subject: mfd: Cache wm8994 chip revision

There's no need to mark the chip revision registers as volatile, it won't
change at runtime so we can cache it from the device at startup.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/wm8994-regmap.c b/drivers/mfd/wm8994-regmap.c
index bfd25af..52e9e29 100644
--- a/drivers/mfd/wm8994-regmap.c
+++ b/drivers/mfd/wm8994-regmap.c
@@ -1122,7 +1122,6 @@ static bool wm8994_volatile_register(struct device *dev, unsigned int reg)
 	case WM8994_RATE_STATUS:
 	case WM8958_MIC_DETECT_3:
 	case WM8994_DC_SERVO_4E:
-	case WM8994_CHIP_REVISION:
 	case WM8994_INTERRUPT_STATUS_1:
 	case WM8994_INTERRUPT_STATUS_2:
 		return true;
-- 
cgit v0.10.2


From f22a9c6fd56f0a7b24ee43466e524ad4576c6dfd Mon Sep 17 00:00:00 2001
From: Paul Parsons <lost.distance@yahoo.com>
Date: Thu, 5 Apr 2012 17:45:04 +0100
Subject: mfd: Add PCMCIA/CF support to asic3

This patch is part of a set which adds PCMCIA/CF support for the hx4700.
This patch adds asic3_set_register() calls to:
1. Enable the PCMCIA/CF in asic3_probe().
2. Disable the PCMCIA/CF in asic3_remove().

Signed-off-by: Paul Parsons <lost.distance@yahoo.com>
Acked-by: Philipp Zabel <philipp.zabel@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c
index 1582c3d..f75dc67 100644
--- a/drivers/mfd/asic3.c
+++ b/drivers/mfd/asic3.c
@@ -1000,6 +1000,9 @@ static int __init asic3_probe(struct platform_device *pdev)
 
 	asic3_mfd_probe(pdev, pdata, mem);
 
+	asic3_set_register(asic, ASIC3_OFFSET(EXTCF, SELECT),
+		(ASIC3_EXTCF_CF0_BUF_EN|ASIC3_EXTCF_CF0_PWAIT_EN), 1);
+
 	dev_info(asic->dev, "ASIC3 Core driver\n");
 
 	return 0;
@@ -1021,6 +1024,9 @@ static int __devexit asic3_remove(struct platform_device *pdev)
 	int ret;
 	struct asic3 *asic = platform_get_drvdata(pdev);
 
+	asic3_set_register(asic, ASIC3_OFFSET(EXTCF, SELECT),
+		(ASIC3_EXTCF_CF0_BUF_EN|ASIC3_EXTCF_CF0_PWAIT_EN), 0);
+
 	asic3_mfd_remove(pdev);
 
 	ret = asic3_gpio_remove(pdev);
-- 
cgit v0.10.2


From e1277f45d8748ff59608b140780f75390cb5400c Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Mon, 9 Apr 2012 13:55:55 +0530
Subject: mfd: Add rc5t583's gpio in mfd device list

Adding the gpio of RC583 in the list of rc583 mfd devices
to register the gpio driver of RC5T583.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/rc5t583.c b/drivers/mfd/rc5t583.c
index 44afae0..1cb4c35 100644
--- a/drivers/mfd/rc5t583.c
+++ b/drivers/mfd/rc5t583.c
@@ -75,6 +75,7 @@ static struct deepsleep_control_data deepsleep_data[] = {
 	(RC5T583_EXT_PWRREQ1_CONTROL | RC5T583_EXT_PWRREQ2_CONTROL)
 
 static struct mfd_cell rc5t583_subdevs[] = {
+	{.name = "rc5t583-gpio",},
 	{.name = "rc5t583-regulator",},
 	{.name = "rc5t583-rtc",      },
 	{.name = "rc5t583-key",      }
-- 
cgit v0.10.2


From 4f304245bb6cfa665ff21b12c059499eafa8b725 Mon Sep 17 00:00:00 2001
From: Paul Parsons <lost.distance@yahoo.com>
Date: Mon, 9 Apr 2012 13:18:31 +0100
Subject: mfd: Set asic3 DS1WM clock_rate

The mfd/asic3 driver does not set the ds1wm_driver_data clock_rate field
before passing the structure to the DS1WM w1 busmaster driver.
This was not noticed before commit 26a6afb, because ds1wm_find_divisor()
unintentionally returned the correct divisor when a zero clock_rate was
passed in. However after that commit DS1WM fails a zero clock_rate:

ds1wm ds1wm: no suitable divisor for 0Hz clock

This patch sets the ds1wm_driver_data clock_rate field.

Signed-off-by: Paul Parsons <lost.distance@yahoo.com>
Acked-by: Philipp Zabel <philipp.zabel@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c
index f75dc67..4c3ec81 100644
--- a/drivers/mfd/asic3.c
+++ b/drivers/mfd/asic3.c
@@ -894,10 +894,13 @@ static int __init asic3_mfd_probe(struct platform_device *pdev,
 	asic3_mmc_resources[0].start >>= asic->bus_shift;
 	asic3_mmc_resources[0].end   >>= asic->bus_shift;
 
-	ret = mfd_add_devices(&pdev->dev, pdev->id,
+	if (pdata->clock_rate) {
+		ds1wm_pdata.clock_rate = pdata->clock_rate;
+		ret = mfd_add_devices(&pdev->dev, pdev->id,
 			&asic3_cell_ds1wm, 1, mem, asic->irq_base);
-	if (ret < 0)
-		goto out;
+		if (ret < 0)
+			goto out;
+	}
 
 	if (mem_sdio && (irq >= 0)) {
 		ret = mfd_add_devices(&pdev->dev, pdev->id,
diff --git a/include/linux/mfd/asic3.h b/include/linux/mfd/asic3.h
index ed793b7..3fda7e5 100644
--- a/include/linux/mfd/asic3.h
+++ b/include/linux/mfd/asic3.h
@@ -31,6 +31,8 @@ struct asic3_platform_data {
 
 	unsigned int gpio_base;
 
+	unsigned int clock_rate;
+
 	struct asic3_led *leds;
 };
 
-- 
cgit v0.10.2


From 1baf665b8167c0ab1240e76b1eae647d5ab60b23 Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Tue, 10 Apr 2012 22:51:28 +0200
Subject: mfd: Remove redundant spi driver bus initialization

In ancient times it was necessary to manually initialize the bus field of an
spi_driver to spi_bus_type. These days this is done in spi_driver_register() so
we can drop the manual assignment.

The patch was generated using the following coccinelle semantic patch:
// <smpl>
@@
identifier _driver;
@@
struct spi_driver _driver = {
    .driver = {
-           .bus = &spi_bus_type,
    },
};
// </smpl>

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Cc: Srinidhi Kasagar <srinidhi.kasagar@stericsson.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/da9052-spi.c b/drivers/mfd/da9052-spi.c
index 6faf149..6e09498 100644
--- a/drivers/mfd/da9052-spi.c
+++ b/drivers/mfd/da9052-spi.c
@@ -88,7 +88,6 @@ static struct spi_driver da9052_spi_driver = {
 	.id_table = da9052_spi_id,
 	.driver = {
 		.name = "da9052",
-		.bus = &spi_bus_type,
 		.owner = THIS_MODULE,
 	},
 };
diff --git a/drivers/mfd/stmpe-spi.c b/drivers/mfd/stmpe-spi.c
index b58c43c..afd4590 100644
--- a/drivers/mfd/stmpe-spi.c
+++ b/drivers/mfd/stmpe-spi.c
@@ -122,7 +122,6 @@ MODULE_DEVICE_TABLE(spi, stmpe_id);
 static struct spi_driver stmpe_spi_driver = {
 	.driver = {
 		.name	= "stmpe-spi",
-		.bus	= &spi_bus_type,
 		.owner	= THIS_MODULE,
 #ifdef CONFIG_PM
 		.pm	= &stmpe_dev_pm_ops,
-- 
cgit v0.10.2


From 2fe372fc2a037c8de0c721b45cd0e4e9c8d8c25e Mon Sep 17 00:00:00 2001
From: Paul Parsons <lost.distance@yahoo.com>
Date: Wed, 11 Apr 2012 00:35:34 +0100
Subject: mfd: Avoid unbalanced asic3 irq wakeup enables/disables

The mfd/asic3 driver does not currently define a irq_set_wake() handler.
Consequently any attempt to configure the 3 ASIC3 GPIO buttons - RECORD,
CALENDAR, HOME - as wakeup sources results in Unbalanced IRQ warnings
when the system is woken from sleep mode:

WARNING: at kernel/irq/manage.c:520 irq_set_irq_wake+0xc4/0xf8()
Unbalanced IRQ 342 wake disable
...
WARNING: at kernel/irq/manage.c:520 irq_set_irq_wake+0xc4/0xf8()
Unbalanced IRQ 337 wake disable
...
WARNING: at kernel/irq/manage.c:520 irq_set_irq_wake+0xc4/0xf8()
Unbalanced IRQ 339 wake disable
...

This patch adds a irq_set_wake() handler to the mfd/asic3 driver.

Signed-off-by: Paul Parsons <lost.distance@yahoo.com>
Cc: Philipp Zabel <philipp.zabel@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c
index 4c3ec81..9d4a492 100644
--- a/drivers/mfd/asic3.c
+++ b/drivers/mfd/asic3.c
@@ -353,12 +353,28 @@ static int asic3_gpio_irq_type(struct irq_data *data, unsigned int type)
 	return 0;
 }
 
+static int asic3_gpio_irq_set_wake(struct irq_data *data, unsigned int on)
+{
+	struct asic3 *asic = irq_data_get_irq_chip_data(data);
+	u32 bank, index;
+	u16 bit;
+
+	bank = asic3_irq_to_bank(asic, data->irq);
+	index = asic3_irq_to_index(asic, data->irq);
+	bit = 1<<index;
+
+	asic3_set_register(asic, bank + ASIC3_GPIO_SLEEP_MASK, bit, !on);
+
+	return 0;
+}
+
 static struct irq_chip asic3_gpio_irq_chip = {
 	.name		= "ASIC3-GPIO",
 	.irq_ack	= asic3_mask_gpio_irq,
 	.irq_mask	= asic3_mask_gpio_irq,
 	.irq_unmask	= asic3_unmask_gpio_irq,
 	.irq_set_type	= asic3_gpio_irq_type,
+	.irq_set_wake	= asic3_gpio_irq_set_wake,
 };
 
 static struct irq_chip asic3_irq_chip = {
-- 
cgit v0.10.2


From 465c29d384f60c5fd9bcbc92efb9e249e585740a Mon Sep 17 00:00:00 2001
From: Pasi Savanainen <ext-pasi.m.savanainen@nokia.com>
Date: Thu, 12 Apr 2012 12:40:37 +0300
Subject: mfd: Convert Intel MSIC driver to use devm_* interfaces.

The devm_* functions eliminate the need for manual resource releasing
and simplify error handling. Resources allocated by devm_* are freed
automatically on driver detach.

Signed-off-by: Pasi Savanainen <ext-pasi.m.savanainen@gmail.com>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/intel_msic.c b/drivers/mfd/intel_msic.c
index b76657e..59df558 100644
--- a/drivers/mfd/intel_msic.c
+++ b/drivers/mfd/intel_msic.c
@@ -406,7 +406,7 @@ static int __devinit intel_msic_probe(struct platform_device *pdev)
 		return -ENXIO;
 	}
 
-	msic = kzalloc(sizeof(*msic), GFP_KERNEL);
+	msic = devm_kzalloc(&pdev->dev, sizeof(*msic), GFP_KERNEL);
 	if (!msic)
 		return -ENOMEM;
 
@@ -421,21 +421,13 @@ static int __devinit intel_msic_probe(struct platform_device *pdev)
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!res) {
 		dev_err(&pdev->dev, "failed to get SRAM iomem resource\n");
-		ret = -ENODEV;
-		goto fail_free_msic;
+		return -ENODEV;
 	}
 
-	res = request_mem_region(res->start, resource_size(res), pdev->name);
-	if (!res) {
-		ret = -EBUSY;
-		goto fail_free_msic;
-	}
-
-	msic->irq_base = ioremap_nocache(res->start, resource_size(res));
+	msic->irq_base = devm_request_and_ioremap(&pdev->dev, res);
 	if (!msic->irq_base) {
 		dev_err(&pdev->dev, "failed to map SRAM memory\n");
-		ret = -ENOMEM;
-		goto fail_release_region;
+		return -ENOMEM;
 	}
 
 	platform_set_drvdata(pdev, msic);
@@ -443,7 +435,7 @@ static int __devinit intel_msic_probe(struct platform_device *pdev)
 	ret = intel_msic_init_devices(msic);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to initialize MSIC devices\n");
-		goto fail_unmap_mem;
+		return ret;
 	}
 
 	dev_info(&pdev->dev, "Intel MSIC version %c%d (vendor %#x)\n",
@@ -451,27 +443,14 @@ static int __devinit intel_msic_probe(struct platform_device *pdev)
 		 msic->vendor);
 
 	return 0;
-
-fail_unmap_mem:
-	iounmap(msic->irq_base);
-fail_release_region:
-	release_mem_region(res->start, resource_size(res));
-fail_free_msic:
-	kfree(msic);
-
-	return ret;
 }
 
 static int __devexit intel_msic_remove(struct platform_device *pdev)
 {
 	struct intel_msic *msic = platform_get_drvdata(pdev);
-	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 
 	intel_msic_remove_devices(msic);
 	platform_set_drvdata(pdev, NULL);
-	iounmap(msic->irq_base);
-	release_mem_region(res->start, resource_size(res));
-	kfree(msic);
 
 	return 0;
 }
-- 
cgit v0.10.2


From 58d114b669d2b86aa79eac6688590c808072579b Mon Sep 17 00:00:00 2001
From: "Ying-Chun Liu (PaulLiu)" <paul.liu@linaro.org>
Date: Mon, 16 Apr 2012 00:01:50 +0800
Subject: mfd: Add device-tree support for da9502 i2c driver

This patch adds device-tree support for dialog MFD and the binding
documentations.

Signed-off-by: Ying-Chun Liu (PaulLiu) <paul.liu@linaro.org>
Cc: Samuel Ortiz <sameo@linux.intel.com>
Cc: Mark Brown <broonie@opensource.wolfsonmicro.com>
Cc: Shawn Guo <shawn.guo@linaro.org>
Cc: Ashish Jangam <ashish.jangam@kpitcummins.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/Documentation/devicetree/bindings/mfd/da9052-i2c.txt b/Documentation/devicetree/bindings/mfd/da9052-i2c.txt
new file mode 100644
index 0000000..1857f4a
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/da9052-i2c.txt
@@ -0,0 +1,60 @@
+* Dialog DA9052/53 Power Management Integrated Circuit (PMIC)
+
+Required properties:
+- compatible : Should be "dlg,da9052", "dlg,da9053-aa",
+			 "dlg,da9053-ab", or "dlg,da9053-bb"
+
+Sub-nodes:
+- regulators : Contain the regulator nodes. The DA9052/53 regulators are
+  bound using their names as listed below:
+
+    buck0     : regulator BUCK0
+    buck1     : regulator BUCK1
+    buck2     : regulator BUCK2
+    buck3     : regulator BUCK3
+    ldo4      : regulator LDO4
+    ldo5      : regulator LDO5
+    ldo6      : regulator LDO6
+    ldo7      : regulator LDO7
+    ldo8      : regulator LDO8
+    ldo9      : regulator LDO9
+    ldo10     : regulator LDO10
+    ldo11     : regulator LDO11
+    ldo12     : regulator LDO12
+    ldo13     : regulator LDO13
+
+  The bindings details of individual regulator device can be found in:
+  Documentation/devicetree/bindings/regulator/regulator.txt
+
+Examples:
+
+i2c@63fc8000 { /* I2C1 */
+	status = "okay";
+
+	pmic: dialog@48 {
+		compatible = "dlg,da9053-aa";
+		reg = <0x48>;
+
+		regulators {
+			buck0 {
+				regulator-min-microvolt = <500000>;
+				regulator-max-microvolt = <2075000>;
+			};
+
+			buck1 {
+				regulator-min-microvolt = <500000>;
+				regulator-max-microvolt = <2075000>;
+			};
+
+			buck2 {
+				regulator-min-microvolt = <925000>;
+				regulator-max-microvolt = <2500000>;
+			};
+
+			buck3 {
+				regulator-min-microvolt = <925000>;
+				regulator-max-microvolt = <2500000>;
+			};
+		};
+	};
+};
diff --git a/drivers/mfd/da9052-i2c.c b/drivers/mfd/da9052-i2c.c
index 36b88e3..d8abdb3 100644
--- a/drivers/mfd/da9052-i2c.c
+++ b/drivers/mfd/da9052-i2c.c
@@ -22,6 +22,11 @@
 #include <linux/mfd/da9052/da9052.h>
 #include <linux/mfd/da9052/reg.h>
 
+#ifdef CONFIG_OF
+#include <linux/of.h>
+#include <linux/of_device.h>
+#endif
+
 static int da9052_i2c_enable_multiwrite(struct da9052 *da9052)
 {
 	int reg_val, ret;
@@ -41,6 +46,24 @@ static int da9052_i2c_enable_multiwrite(struct da9052 *da9052)
 	return 0;
 }
 
+static struct i2c_device_id da9052_i2c_id[] = {
+	{"da9052", DA9052},
+	{"da9053-aa", DA9053_AA},
+	{"da9053-ba", DA9053_BA},
+	{"da9053-bb", DA9053_BB},
+	{}
+};
+
+#ifdef CONFIG_OF
+static const struct of_device_id dialog_dt_ids[] = {
+	{ .compatible = "dlg,da9052", .data = &da9052_i2c_id[0] },
+	{ .compatible = "dlg,da9053-aa", .data = &da9052_i2c_id[1] },
+	{ .compatible = "dlg,da9053-ab", .data = &da9052_i2c_id[2] },
+	{ .compatible = "dlg,da9053-bb", .data = &da9052_i2c_id[3] },
+	{ /* sentinel */ }
+};
+#endif
+
 static int __devinit da9052_i2c_probe(struct i2c_client *client,
 				       const struct i2c_device_id *id)
 {
@@ -76,6 +99,22 @@ static int __devinit da9052_i2c_probe(struct i2c_client *client,
 	if (ret < 0)
 		goto err_regmap;
 
+#ifdef CONFIG_OF
+	if (!id) {
+		struct device_node *np = client->dev.of_node;
+		const struct of_device_id *deviceid;
+
+		deviceid = of_match_node(np, dialog_dt_ids);
+		id = (const struct i2c_device_id *)deviceid->data;
+	}
+#endif
+
+	if (!id) {
+		ret = -ENODEV;
+		dev_err(&client->dev, "id is null.\n");
+		goto err_regmap;
+	}
+
 	ret = da9052_device_init(da9052, id->driver_data);
 	if (ret != 0)
 		goto err_regmap;
@@ -100,14 +139,6 @@ static int __devexit da9052_i2c_remove(struct i2c_client *client)
 	return 0;
 }
 
-static struct i2c_device_id da9052_i2c_id[] = {
-	{"da9052", DA9052},
-	{"da9053-aa", DA9053_AA},
-	{"da9053-ba", DA9053_BA},
-	{"da9053-bb", DA9053_BB},
-	{}
-};
-
 static struct i2c_driver da9052_i2c_driver = {
 	.probe = da9052_i2c_probe,
 	.remove = __devexit_p(da9052_i2c_remove),
@@ -115,6 +146,9 @@ static struct i2c_driver da9052_i2c_driver = {
 	.driver = {
 		.name = "da9052",
 		.owner = THIS_MODULE,
+#ifdef CONFIG_OF
+		.of_match_table = dialog_dt_ids,
+#endif
 	},
 };
 
-- 
cgit v0.10.2


From 201cf052810d20814a77ca0e0045a2c1a3508a1f Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Wed, 18 Apr 2012 12:13:51 +0200
Subject: mfd: Add support for tps65910 device sleep

Adding support for device sleep through the external input control
signal "SLEEP".
Changing the SLEEP signal state can switch the device into SLEEP and
ACTIVE state.
Also adding sleep configuration for different resources so that they
should be keep on during sleep state of device.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c
index bf2b25e..ae7f47b 100644
--- a/drivers/mfd/tps65910.c
+++ b/drivers/mfd/tps65910.c
@@ -90,6 +90,66 @@ static const struct regmap_config tps65910_regmap_config = {
 	.cache_type = REGCACHE_RBTREE,
 };
 
+static int __init tps65910_sleepinit(struct tps65910 *tps65910,
+		struct tps65910_board *pmic_pdata)
+{
+	struct device *dev = NULL;
+	int ret = 0;
+
+	dev = tps65910->dev;
+
+	if (!pmic_pdata->en_dev_slp)
+		return 0;
+
+	/* enabling SLEEP device state */
+	ret = tps65910_set_bits(tps65910, TPS65910_DEVCTRL,
+				DEVCTRL_DEV_SLP_MASK);
+	if (ret < 0) {
+		dev_err(dev, "set dev_slp failed: %d\n", ret);
+		goto err_sleep_init;
+	}
+
+	/* Return if there is no sleep keepon data. */
+	if (!pmic_pdata->slp_keepon)
+		return 0;
+
+	if (pmic_pdata->slp_keepon->therm_keepon) {
+		ret = tps65910_set_bits(tps65910, TPS65910_SLEEP_KEEP_RES_ON,
+				SLEEP_KEEP_RES_ON_THERM_KEEPON_MASK);
+		if (ret < 0) {
+			dev_err(dev, "set therm_keepon failed: %d\n", ret);
+			goto disable_dev_slp;
+		}
+	}
+
+	if (pmic_pdata->slp_keepon->clkout32k_keepon) {
+		ret = tps65910_set_bits(tps65910, TPS65910_SLEEP_KEEP_RES_ON,
+				SLEEP_KEEP_RES_ON_CLKOUT32K_KEEPON_MASK);
+		if (ret < 0) {
+			dev_err(dev, "set clkout32k_keepon failed: %d\n", ret);
+			goto disable_dev_slp;
+		}
+	}
+
+	if (pmic_pdata->slp_keepon->i2chs_keepon) {
+		ret = tps65910_set_bits(tps65910, TPS65910_SLEEP_KEEP_RES_ON,
+				SLEEP_KEEP_RES_ON_I2CHS_KEEPON_MASK);
+		if (ret < 0) {
+			dev_err(dev, "set i2chs_keepon failed: %d\n", ret);
+			goto disable_dev_slp;
+		}
+	}
+
+	return 0;
+
+disable_dev_slp:
+	tps65910_clear_bits(tps65910, TPS65910_DEVCTRL, DEVCTRL_DEV_SLP_MASK);
+
+err_sleep_init:
+	return ret;
+}
+
+
 static int tps65910_i2c_probe(struct i2c_client *i2c,
 			    const struct i2c_device_id *id)
 {
@@ -140,6 +200,8 @@ static int tps65910_i2c_probe(struct i2c_client *i2c,
 
 	tps65910_irq_init(tps65910, init_data->irq, init_data);
 
+	tps65910_sleepinit(tps65910, pmic_plat_data);
+
 	kfree(init_data);
 	return ret;
 
diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h
index 1c6c286..56903ad 100644
--- a/include/linux/mfd/tps65910.h
+++ b/include/linux/mfd/tps65910.h
@@ -783,6 +783,18 @@
 #define TPS65910_SLEEP_CONTROL_EXT_INPUT_EN3		0x4
 #define TPS65911_SLEEP_CONTROL_EXT_INPUT_SLEEP		0x8
 
+/*
+ * Sleep keepon data: Maintains the state in sleep mode
+ * @therm_keepon: Keep on the thermal monitoring in sleep state.
+ * @clkout32k_keepon: Keep on the 32KHz clock output in sleep state.
+ * @i2chs_keepon: Keep on high speed internal clock in sleep state.
+ */
+struct tps65910_sleep_keepon_data {
+	unsigned therm_keepon:1;
+	unsigned clkout32k_keepon:1;
+	unsigned i2chs_keepon:1;
+};
+
 /**
  * struct tps65910_board
  * Board platform data may be used to initialize regulators.
@@ -794,6 +806,8 @@ struct tps65910_board {
 	int irq_base;
 	int vmbch_threshold;
 	int vmbch2_threshold;
+	bool en_dev_slp;
+	struct tps65910_sleep_keepon_data *slp_keepon;
 	bool en_gpio_sleep[TPS6591X_MAX_NUM_GPIO];
 	unsigned long regulator_ext_sleep_control[TPS65910_NUM_REGS];
 	struct regulator_init_data *tps65910_pmic_init_data[TPS65910_NUM_REGS];
-- 
cgit v0.10.2


From 12693f6c1ff6f43f34a0d105307976337f64dcdd Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Mon, 16 Apr 2012 21:28:29 +0200
Subject: mfd: No need to check for the GPIO offset from asic3_gpio_to_irq

The gpiolib code will only call our gpio_to_irq ops for our registered
GPIO range.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c
index 9d4a492..383421b 100644
--- a/drivers/mfd/asic3.c
+++ b/drivers/mfd/asic3.c
@@ -545,7 +545,7 @@ static int asic3_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
 {
 	struct asic3 *asic = container_of(chip, struct asic3, gpio);
 
-	return (offset < ASIC3_NUM_GPIOS) ? asic->irq_base + offset : -ENXIO;
+	return asic->irq_base + offset;
 }
 
 static __init int asic3_gpio_probe(struct platform_device *pdev,
-- 
cgit v0.10.2


From 4c394bb1683dbd110314ab37da7bfa6c9a77073d Mon Sep 17 00:00:00 2001
From: Russ Dill <Russ.Dill@ti.com>
Date: Sun, 22 Apr 2012 01:48:18 -0700
Subject: mfd: Fix build breakage in omap-usb-host.c

'ARM: OMAP3: USB: Fix the EHCI ULPI PHY reset issue' removes the include for
linux/gpio.h from omap-usb-host.c. This include indirectly includes plat/cpu.h
which is required by omap-usb-host.c. Fix the build breakage by including
it directly.

Acked-by: Keshava Munegowda <keshava_mgowda@ti.com>
Acked-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: Russ Dill <Russ.Dill@ti.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c
index c8aae66..7e96bb2 100644
--- a/drivers/mfd/omap-usb-host.c
+++ b/drivers/mfd/omap-usb-host.c
@@ -25,6 +25,7 @@
 #include <linux/clk.h>
 #include <linux/dma-mapping.h>
 #include <linux/spinlock.h>
+#include <plat/cpu.h>
 #include <plat/usb.h>
 #include <linux/pm_runtime.h>
 
-- 
cgit v0.10.2


From 5006fe546a4eb0393782b818d4e0e2a6b4fa3803 Mon Sep 17 00:00:00 2001
From: Marc Reilly <marc@cpdesign.com.au>
Date: Tue, 1 May 2012 12:26:46 +0200
Subject: mfd: Prepare for separating spi and i2c mc13xxx-core backends

This patch abstracts the bus specific operations from the driver core.
Generic init and cleanup is consolidated into mc13xxx_common_*.
spi specific functions are renamed to reflect such.
(The irq member of the mc13xxx struct is no longer redundant, it's used
to store the irq for cleanup time).

Signed-off-by: Marc Reilly <marc@cpdesign.com.au>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/mc13xxx-core.c b/drivers/mfd/mc13xxx-core.c
index 9fd4f63..c8b7a28 100644
--- a/drivers/mfd/mc13xxx-core.c
+++ b/drivers/mfd/mc13xxx-core.c
@@ -22,8 +22,18 @@
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
 
+enum mc13xxx_id {
+	MC13XXX_ID_MC13783,
+	MC13XXX_ID_MC13892,
+	MC13XXX_ID_INVALID,
+};
+
 struct mc13xxx {
 	struct spi_device *spidev;
+
+	struct device *dev;
+	enum mc13xxx_id ictype;
+
 	struct mutex lock;
 	int irq;
 	int flags;
@@ -144,36 +154,32 @@ struct mc13xxx {
 void mc13xxx_lock(struct mc13xxx *mc13xxx)
 {
 	if (!mutex_trylock(&mc13xxx->lock)) {
-		dev_dbg(&mc13xxx->spidev->dev, "wait for %s from %pf\n",
+		dev_dbg(mc13xxx->dev, "wait for %s from %pf\n",
 				__func__, __builtin_return_address(0));
 
 		mutex_lock(&mc13xxx->lock);
 	}
-	dev_dbg(&mc13xxx->spidev->dev, "%s from %pf\n",
+	dev_dbg(mc13xxx->dev, "%s from %pf\n",
 			__func__, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(mc13xxx_lock);
 
 void mc13xxx_unlock(struct mc13xxx *mc13xxx)
 {
-	dev_dbg(&mc13xxx->spidev->dev, "%s from %pf\n",
+	dev_dbg(mc13xxx->dev, "%s from %pf\n",
 			__func__, __builtin_return_address(0));
 	mutex_unlock(&mc13xxx->lock);
 }
 EXPORT_SYMBOL(mc13xxx_unlock);
 
 #define MC13XXX_REGOFFSET_SHIFT 25
-int mc13xxx_reg_read(struct mc13xxx *mc13xxx, unsigned int offset, u32 *val)
+static int mc13xxx_spi_reg_read(struct mc13xxx *mc13xxx,
+				unsigned int offset, u32 *val)
 {
 	struct spi_transfer t;
 	struct spi_message m;
 	int ret;
 
-	BUG_ON(!mutex_is_locked(&mc13xxx->lock));
-
-	if (offset > MC13XXX_NUMREGS)
-		return -EINVAL;
-
 	*val = offset << MC13XXX_REGOFFSET_SHIFT;
 
 	memset(&t, 0, sizeof(t));
@@ -195,26 +201,17 @@ int mc13xxx_reg_read(struct mc13xxx *mc13xxx, unsigned int offset, u32 *val)
 
 	*val &= 0xffffff;
 
-	dev_vdbg(&mc13xxx->spidev->dev, "[0x%02x] -> 0x%06x\n", offset, *val);
-
 	return 0;
 }
-EXPORT_SYMBOL(mc13xxx_reg_read);
 
-int mc13xxx_reg_write(struct mc13xxx *mc13xxx, unsigned int offset, u32 val)
+static int mc13xxx_spi_reg_write(struct mc13xxx *mc13xxx, unsigned int offset,
+		u32 val)
 {
 	u32 buf;
 	struct spi_transfer t;
 	struct spi_message m;
 	int ret;
 
-	BUG_ON(!mutex_is_locked(&mc13xxx->lock));
-
-	dev_vdbg(&mc13xxx->spidev->dev, "[0x%02x] <- 0x%06x\n", offset, val);
-
-	if (offset > MC13XXX_NUMREGS || val > 0xffffff)
-		return -EINVAL;
-
 	buf = 1 << 31 | offset << MC13XXX_REGOFFSET_SHIFT | val;
 
 	memset(&t, 0, sizeof(t));
@@ -235,6 +232,34 @@ int mc13xxx_reg_write(struct mc13xxx *mc13xxx, unsigned int offset, u32 val)
 
 	return 0;
 }
+
+int mc13xxx_reg_read(struct mc13xxx *mc13xxx, unsigned int offset, u32 *val)
+{
+	int ret;
+
+	BUG_ON(!mutex_is_locked(&mc13xxx->lock));
+
+	if (offset > MC13XXX_NUMREGS)
+		return -EINVAL;
+
+	ret = mc13xxx_spi_reg_read(mc13xxx, offset, val);
+	dev_vdbg(mc13xxx->dev, "[0x%02x] -> 0x%06x\n", offset, *val);
+
+	return ret;
+}
+EXPORT_SYMBOL(mc13xxx_reg_read);
+
+int mc13xxx_reg_write(struct mc13xxx *mc13xxx, unsigned int offset, u32 val)
+{
+	BUG_ON(!mutex_is_locked(&mc13xxx->lock));
+
+	dev_vdbg(mc13xxx->dev, "[0x%02x] <- 0x%06x\n", offset, val);
+
+	if (offset > MC13XXX_NUMREGS || val > 0xffffff)
+		return -EINVAL;
+
+	return mc13xxx_spi_reg_write(mc13xxx, offset, val);
+}
 EXPORT_SYMBOL(mc13xxx_reg_write);
 
 int mc13xxx_reg_rmw(struct mc13xxx *mc13xxx, unsigned int offset,
@@ -439,7 +464,7 @@ static int mc13xxx_irq_handle(struct mc13xxx *mc13xxx,
 			if (handled == IRQ_HANDLED)
 				num_handled++;
 		} else {
-			dev_err(&mc13xxx->spidev->dev,
+			dev_err(mc13xxx->dev,
 					"BUG: irq %u but no handler\n",
 					baseirq + irq);
 
@@ -475,25 +500,23 @@ static irqreturn_t mc13xxx_irq_thread(int irq, void *data)
 	return IRQ_RETVAL(handled);
 }
 
-enum mc13xxx_id {
-	MC13XXX_ID_MC13783,
-	MC13XXX_ID_MC13892,
-	MC13XXX_ID_INVALID,
-};
-
 static const char *mc13xxx_chipname[] = {
 	[MC13XXX_ID_MC13783] = "mc13783",
 	[MC13XXX_ID_MC13892] = "mc13892",
 };
 
 #define maskval(reg, mask)	(((reg) & (mask)) >> __ffs(mask))
-static int mc13xxx_identify(struct mc13xxx *mc13xxx, enum mc13xxx_id *id)
+static int mc13xxx_identify(struct mc13xxx *mc13xxx)
 {
 	u32 icid;
 	u32 revision;
-	const char *name;
 	int ret;
 
+	/*
+	 * Get the generation ID from register 46, as apparently some older
+	 * IC revisions only have this info at this location. Newer ICs seem to
+	 * have both.
+	 */
 	ret = mc13xxx_reg_read(mc13xxx, 46, &icid);
 	if (ret)
 		return ret;
@@ -502,26 +525,23 @@ static int mc13xxx_identify(struct mc13xxx *mc13xxx, enum mc13xxx_id *id)
 
 	switch (icid) {
 	case 2:
-		*id = MC13XXX_ID_MC13783;
-		name = "mc13783";
+		mc13xxx->ictype = MC13XXX_ID_MC13783;
 		break;
 	case 7:
-		*id = MC13XXX_ID_MC13892;
-		name = "mc13892";
+		mc13xxx->ictype = MC13XXX_ID_MC13892;
 		break;
 	default:
-		*id = MC13XXX_ID_INVALID;
+		mc13xxx->ictype = MC13XXX_ID_INVALID;
 		break;
 	}
 
-	if (*id == MC13XXX_ID_MC13783 || *id == MC13XXX_ID_MC13892) {
+	if (mc13xxx->ictype == MC13XXX_ID_MC13783 ||
+			mc13xxx->ictype == MC13XXX_ID_MC13892) {
 		ret = mc13xxx_reg_read(mc13xxx, MC13XXX_REVISION, &revision);
-		if (ret)
-			return ret;
 
-		dev_info(&mc13xxx->spidev->dev, "%s: rev: %d.%d, "
+		dev_info(mc13xxx->dev, "%s: rev: %d.%d, "
 				"fin: %d, fab: %d, icid: %d/%d\n",
-				mc13xxx_chipname[*id],
+				mc13xxx_chipname[mc13xxx->ictype],
 				maskval(revision, MC13XXX_REVISION_REVFULL),
 				maskval(revision, MC13XXX_REVISION_REVMETAL),
 				maskval(revision, MC13XXX_REVISION_FIN),
@@ -530,26 +550,12 @@ static int mc13xxx_identify(struct mc13xxx *mc13xxx, enum mc13xxx_id *id)
 				maskval(revision, MC13XXX_REVISION_ICIDCODE));
 	}
 
-	if (*id != MC13XXX_ID_INVALID) {
-		const struct spi_device_id *devid =
-			spi_get_device_id(mc13xxx->spidev);
-		if (!devid || devid->driver_data != *id)
-			dev_warn(&mc13xxx->spidev->dev, "device id doesn't "
-					"match auto detection!\n");
-	}
-
-	return 0;
+	return (mc13xxx->ictype == MC13XXX_ID_INVALID) ? -ENODEV : 0;
 }
 
 static const char *mc13xxx_get_chipname(struct mc13xxx *mc13xxx)
 {
-	const struct spi_device_id *devid =
-		spi_get_device_id(mc13xxx->spidev);
-
-	if (!devid)
-		return NULL;
-
-	return mc13xxx_chipname[devid->driver_data];
+	return mc13xxx_chipname[mc13xxx->ictype];
 }
 
 int mc13xxx_get_flags(struct mc13xxx *mc13xxx)
@@ -592,7 +598,7 @@ int mc13xxx_adc_do_conversion(struct mc13xxx *mc13xxx, unsigned int mode,
 	};
 	init_completion(&adcdone_data.done);
 
-	dev_dbg(&mc13xxx->spidev->dev, "%s\n", __func__);
+	dev_dbg(mc13xxx->dev, "%s\n", __func__);
 
 	mc13xxx_lock(mc13xxx);
 
@@ -637,7 +643,8 @@ int mc13xxx_adc_do_conversion(struct mc13xxx *mc13xxx, unsigned int mode,
 	adc1 |= ato << MC13783_ADC1_ATO_SHIFT;
 	if (atox)
 		adc1 |= MC13783_ADC1_ATOX;
-	dev_dbg(&mc13xxx->spidev->dev, "%s: request irq\n", __func__);
+
+	dev_dbg(mc13xxx->dev, "%s: request irq\n", __func__);
 	mc13xxx_irq_request(mc13xxx, MC13XXX_IRQ_ADCDONE,
 			mc13xxx_handler_adcdone, __func__, &adcdone_data);
 	mc13xxx_irq_ack(mc13xxx, MC13XXX_IRQ_ADCDONE);
@@ -695,7 +702,7 @@ static int mc13xxx_add_subdevice_pdata(struct mc13xxx *mc13xxx,
 	if (!cell.name)
 		return -ENOMEM;
 
-	return mfd_add_devices(&mc13xxx->spidev->dev, -1, &cell, 1, NULL, 0);
+	return mfd_add_devices(mc13xxx->dev, -1, &cell, 1, NULL, 0);
 }
 
 static int mc13xxx_add_subdevice(struct mc13xxx *mc13xxx, const char *format)
@@ -706,7 +713,7 @@ static int mc13xxx_add_subdevice(struct mc13xxx *mc13xxx, const char *format)
 #ifdef CONFIG_OF
 static int mc13xxx_probe_flags_dt(struct mc13xxx *mc13xxx)
 {
-	struct device_node *np = mc13xxx->spidev->dev.of_node;
+	struct device_node *np = mc13xxx->dev.of_node;
 
 	if (!np)
 		return -ENODEV;
@@ -752,13 +759,17 @@ static const struct of_device_id mc13xxx_dt_ids[] = {
 };
 MODULE_DEVICE_TABLE(of, mc13xxx_dt_ids);
 
-static int mc13xxx_probe(struct spi_device *spi)
+static int mc13xxx_common_init(struct mc13xxx *mc13xxx,
+		struct mc13xxx_platform_data *pdata, int irq);
+
+static void mc13xxx_common_cleanup(struct mc13xxx *mc13xxx);
+
+static int mc13xxx_spi_probe(struct spi_device *spi)
 {
 	const struct of_device_id *of_id;
 	struct spi_driver *sdrv = to_spi_driver(spi->dev.driver);
 	struct mc13xxx *mc13xxx;
 	struct mc13xxx_platform_data *pdata = dev_get_platdata(&spi->dev);
-	enum mc13xxx_id id;
 	int ret;
 
 	of_id = of_match_device(mc13xxx_dt_ids, &spi->dev);
@@ -774,13 +785,34 @@ static int mc13xxx_probe(struct spi_device *spi)
 	spi->bits_per_word = 32;
 	spi_setup(spi);
 
+	mc13xxx->dev = &spi->dev;
 	mc13xxx->spidev = spi;
 
+	ret = mc13xxx_common_init(mc13xxx, pdata, spi->irq);
+
+	if (ret) {
+		dev_set_drvdata(&spi->dev, NULL);
+	} else {
+		const struct spi_device_id *devid =
+			spi_get_device_id(mc13xxx->spidev);
+		if (!devid || devid->driver_data != mc13xxx->ictype)
+			dev_warn(mc13xxx->dev,
+				"device id doesn't match auto detection!\n");
+	}
+
+	return ret;
+}
+
+static int mc13xxx_common_init(struct mc13xxx *mc13xxx,
+		struct mc13xxx_platform_data *pdata, int irq)
+{
+	int ret;
+
 	mutex_init(&mc13xxx->lock);
 	mc13xxx_lock(mc13xxx);
 
-	ret = mc13xxx_identify(mc13xxx, &id);
-	if (ret || id == MC13XXX_ID_INVALID)
+	ret = mc13xxx_identify(mc13xxx);
+	if (ret)
 		goto err_revision;
 
 	/* mask all irqs */
@@ -792,18 +824,19 @@ static int mc13xxx_probe(struct spi_device *spi)
 	if (ret)
 		goto err_mask;
 
-	ret = request_threaded_irq(spi->irq, NULL, mc13xxx_irq_thread,
+	ret = request_threaded_irq(irq, NULL, mc13xxx_irq_thread,
 			IRQF_ONESHOT | IRQF_TRIGGER_HIGH, "mc13xxx", mc13xxx);
 
 	if (ret) {
 err_mask:
 err_revision:
 		mc13xxx_unlock(mc13xxx);
-		dev_set_drvdata(&spi->dev, NULL);
 		kfree(mc13xxx);
 		return ret;
 	}
 
+	mc13xxx->irq = irq;
+
 	mc13xxx_unlock(mc13xxx);
 
 	if (mc13xxx_probe_flags_dt(mc13xxx) < 0 && pdata)
@@ -838,39 +871,44 @@ err_revision:
 	return 0;
 }
 
-static int __devexit mc13xxx_remove(struct spi_device *spi)
+static int __devexit mc13xxx_spi_remove(struct spi_device *spi)
 {
 	struct mc13xxx *mc13xxx = dev_get_drvdata(&spi->dev);
 
-	free_irq(mc13xxx->spidev->irq, mc13xxx);
+	mc13xxx_common_cleanup(mc13xxx);
 
-	mfd_remove_devices(&spi->dev);
+	return 0;
+}
 
-	kfree(mc13xxx);
+static void mc13xxx_common_cleanup(struct mc13xxx *mc13xxx)
+{
+	free_irq(mc13xxx->irq, mc13xxx);
 
-	return 0;
+	mfd_remove_devices(mc13xxx->dev);
+
+	kfree(mc13xxx);
 }
 
-static struct spi_driver mc13xxx_driver = {
+static struct spi_driver mc13xxx_spi_driver = {
 	.id_table = mc13xxx_device_id,
 	.driver = {
 		.name = "mc13xxx",
 		.owner = THIS_MODULE,
 		.of_match_table = mc13xxx_dt_ids,
 	},
-	.probe = mc13xxx_probe,
-	.remove = __devexit_p(mc13xxx_remove),
+	.probe = mc13xxx_spi_probe,
+	.remove = __devexit_p(mc13xxx_spi_remove),
 };
 
 static int __init mc13xxx_init(void)
 {
-	return spi_register_driver(&mc13xxx_driver);
+	return spi_register_driver(&mc13xxx_spi_driver);
 }
 subsys_initcall(mc13xxx_init);
 
 static void __exit mc13xxx_exit(void)
 {
-	spi_unregister_driver(&mc13xxx_driver);
+	spi_unregister_driver(&mc13xxx_spi_driver);
 }
 module_exit(mc13xxx_exit);
 
-- 
cgit v0.10.2


From 91b5e741184ea9836cd7d7509e4f9b6eefa27df2 Mon Sep 17 00:00:00 2001
From: Marc Reilly <marc@cpdesign.com.au>
Date: Sun, 1 Apr 2012 16:41:37 +1000
Subject: mfd: Use regmap for the mc13xxx-core register access

This change converts the mc13xxx core to use regmap rather than direct
spi r/w.
The spidev member of mc13xxx struct becomes redundant and is removed.
Extra debugging aids are added to mc13xxx_reg_rmw.
Mutex init is moved to before regmap init.

Signed-off-by: Marc Reilly <marc@cpdesign.com.au>
Reviewed-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index c6edba6..411a61a 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -600,6 +600,7 @@ config MFD_MC13XXX
 	depends on SPI_MASTER
 	select MFD_CORE
 	select MFD_MC13783
+	select REGMAP_SPI
 	help
 	  Support for the Freescale (Atlas) PMIC and audio CODECs
 	  MC13783 and MC13892.
diff --git a/drivers/mfd/mc13xxx-core.c b/drivers/mfd/mc13xxx-core.c
index c8b7a28..d761a2e 100644
--- a/drivers/mfd/mc13xxx-core.c
+++ b/drivers/mfd/mc13xxx-core.c
@@ -21,6 +21,8 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
+#include <linux/regmap.h>
+#include <linux/err.h>
 
 enum mc13xxx_id {
 	MC13XXX_ID_MC13783,
@@ -29,7 +31,7 @@ enum mc13xxx_id {
 };
 
 struct mc13xxx {
-	struct spi_device *spidev;
+	struct regmap *regmap;
 
 	struct device *dev;
 	enum mc13xxx_id ictype;
@@ -172,67 +174,6 @@ void mc13xxx_unlock(struct mc13xxx *mc13xxx)
 }
 EXPORT_SYMBOL(mc13xxx_unlock);
 
-#define MC13XXX_REGOFFSET_SHIFT 25
-static int mc13xxx_spi_reg_read(struct mc13xxx *mc13xxx,
-				unsigned int offset, u32 *val)
-{
-	struct spi_transfer t;
-	struct spi_message m;
-	int ret;
-
-	*val = offset << MC13XXX_REGOFFSET_SHIFT;
-
-	memset(&t, 0, sizeof(t));
-
-	t.tx_buf = val;
-	t.rx_buf = val;
-	t.len = sizeof(u32);
-
-	spi_message_init(&m);
-	spi_message_add_tail(&t, &m);
-
-	ret = spi_sync(mc13xxx->spidev, &m);
-
-	/* error in message.status implies error return from spi_sync */
-	BUG_ON(!ret && m.status);
-
-	if (ret)
-		return ret;
-
-	*val &= 0xffffff;
-
-	return 0;
-}
-
-static int mc13xxx_spi_reg_write(struct mc13xxx *mc13xxx, unsigned int offset,
-		u32 val)
-{
-	u32 buf;
-	struct spi_transfer t;
-	struct spi_message m;
-	int ret;
-
-	buf = 1 << 31 | offset << MC13XXX_REGOFFSET_SHIFT | val;
-
-	memset(&t, 0, sizeof(t));
-
-	t.tx_buf = &buf;
-	t.rx_buf = &buf;
-	t.len = sizeof(u32);
-
-	spi_message_init(&m);
-	spi_message_add_tail(&t, &m);
-
-	ret = spi_sync(mc13xxx->spidev, &m);
-
-	BUG_ON(!ret && m.status);
-
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
 int mc13xxx_reg_read(struct mc13xxx *mc13xxx, unsigned int offset, u32 *val)
 {
 	int ret;
@@ -242,7 +183,7 @@ int mc13xxx_reg_read(struct mc13xxx *mc13xxx, unsigned int offset, u32 *val)
 	if (offset > MC13XXX_NUMREGS)
 		return -EINVAL;
 
-	ret = mc13xxx_spi_reg_read(mc13xxx, offset, val);
+	ret = regmap_read(mc13xxx->regmap, offset, val);
 	dev_vdbg(mc13xxx->dev, "[0x%02x] -> 0x%06x\n", offset, *val);
 
 	return ret;
@@ -258,25 +199,19 @@ int mc13xxx_reg_write(struct mc13xxx *mc13xxx, unsigned int offset, u32 val)
 	if (offset > MC13XXX_NUMREGS || val > 0xffffff)
 		return -EINVAL;
 
-	return mc13xxx_spi_reg_write(mc13xxx, offset, val);
+	return regmap_write(mc13xxx->regmap, offset, val);
 }
 EXPORT_SYMBOL(mc13xxx_reg_write);
 
 int mc13xxx_reg_rmw(struct mc13xxx *mc13xxx, unsigned int offset,
 		u32 mask, u32 val)
 {
-	int ret;
-	u32 valread;
-
+	BUG_ON(!mutex_is_locked(&mc13xxx->lock));
 	BUG_ON(val & ~mask);
+	dev_vdbg(mc13xxx->dev, "[0x%02x] <- 0x%06x (mask: 0x%06x)\n",
+			offset, val, mask);
 
-	ret = mc13xxx_reg_read(mc13xxx, offset, &valread);
-	if (ret)
-		return ret;
-
-	valread = (valread & ~mask) | val;
-
-	return mc13xxx_reg_write(mc13xxx, offset, valread);
+	return regmap_update_bits(mc13xxx->regmap, offset, mask, val);
 }
 EXPORT_SYMBOL(mc13xxx_reg_rmw);
 
@@ -713,7 +648,7 @@ static int mc13xxx_add_subdevice(struct mc13xxx *mc13xxx, const char *format)
 #ifdef CONFIG_OF
 static int mc13xxx_probe_flags_dt(struct mc13xxx *mc13xxx)
 {
-	struct device_node *np = mc13xxx->dev.of_node;
+	struct device_node *np = mc13xxx->dev->of_node;
 
 	if (!np)
 		return -ENODEV;
@@ -759,6 +694,16 @@ static const struct of_device_id mc13xxx_dt_ids[] = {
 };
 MODULE_DEVICE_TABLE(of, mc13xxx_dt_ids);
 
+static struct regmap_config mc13xxx_regmap_spi_config = {
+	.reg_bits = 7,
+	.pad_bits = 1,
+	.val_bits = 24,
+
+	.max_register = MC13XXX_NUMREGS,
+
+	.cache_type = REGCACHE_NONE,
+};
+
 static int mc13xxx_common_init(struct mc13xxx *mc13xxx,
 		struct mc13xxx_platform_data *pdata, int irq);
 
@@ -783,10 +728,19 @@ static int mc13xxx_spi_probe(struct spi_device *spi)
 	dev_set_drvdata(&spi->dev, mc13xxx);
 	spi->mode = SPI_MODE_0 | SPI_CS_HIGH;
 	spi->bits_per_word = 32;
-	spi_setup(spi);
 
 	mc13xxx->dev = &spi->dev;
-	mc13xxx->spidev = spi;
+	mutex_init(&mc13xxx->lock);
+
+	mc13xxx->regmap = regmap_init_spi(spi, &mc13xxx_regmap_spi_config);
+	if (IS_ERR(mc13xxx->regmap)) {
+		ret = PTR_ERR(mc13xxx->regmap);
+		dev_err(mc13xxx->dev, "Failed to initialize register map: %d\n",
+				ret);
+		dev_set_drvdata(&spi->dev, NULL);
+		kfree(mc13xxx);
+		return ret;
+	}
 
 	ret = mc13xxx_common_init(mc13xxx, pdata, spi->irq);
 
@@ -794,7 +748,7 @@ static int mc13xxx_spi_probe(struct spi_device *spi)
 		dev_set_drvdata(&spi->dev, NULL);
 	} else {
 		const struct spi_device_id *devid =
-			spi_get_device_id(mc13xxx->spidev);
+			spi_get_device_id(spi);
 		if (!devid || devid->driver_data != mc13xxx->ictype)
 			dev_warn(mc13xxx->dev,
 				"device id doesn't match auto detection!\n");
@@ -808,7 +762,6 @@ static int mc13xxx_common_init(struct mc13xxx *mc13xxx,
 {
 	int ret;
 
-	mutex_init(&mc13xxx->lock);
 	mc13xxx_lock(mc13xxx);
 
 	ret = mc13xxx_identify(mc13xxx);
@@ -886,6 +839,8 @@ static void mc13xxx_common_cleanup(struct mc13xxx *mc13xxx)
 
 	mfd_remove_devices(mc13xxx->dev);
 
+	regmap_exit(mc13xxx->regmap);
+
 	kfree(mc13xxx);
 }
 
-- 
cgit v0.10.2


From a0c7c1d48ea9f53c67c79eda498bb8eda1422748 Mon Sep 17 00:00:00 2001
From: Marc Reilly <marc@cpdesign.com.au>
Date: Sun, 1 Apr 2012 16:41:38 +1000
Subject: mfd: Move the mc13xxx-core spi specific code into a separate module

All spi specific code is moved into a new module. The mc13xxx struct
moves to a new local include file by necessity.

A new config choice selects the SPI bus type support and by default is
value of SPI_MASTER to remain compatible with existing configs.

Signed-off-by: Marc Reilly <marc@cpdesign.com.au>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 411a61a..9c8294c 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -600,14 +600,23 @@ config MFD_MC13XXX
 	depends on SPI_MASTER
 	select MFD_CORE
 	select MFD_MC13783
-	select REGMAP_SPI
 	help
-	  Support for the Freescale (Atlas) PMIC and audio CODECs
-	  MC13783 and MC13892.
-	  This driver provides common support for accessing  the device,
+	  Enable support for the Freescale MC13783 and MC13892 PMICs.
+	  This driver provides common support for accessing the device,
 	  additional drivers must be enabled in order to use the
 	  functionality of the device.
 
+if MFD_MC13XXX
+
+config MFD_MC13XXX_SPI
+	tristate "MC13xxx SPI interface" if SPI_MASTER
+	default SPI_MASTER
+	select REGMAP_SPI
+	help
+	  Select this if your MC13xxx is connected via an SPI bus.
+
+endif
+
 config ABX500_CORE
 	bool "ST-Ericsson ABX500 Mixed Signal Circuit register functions"
 	default y if ARCH_U300 || ARCH_U8500
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index c4500c3..7ce5b5c 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_TWL6030_PWM)	+= twl6030-pwm.o
 obj-$(CONFIG_TWL6040_CORE)	+= twl6040-core.o twl6040-irq.o
 
 obj-$(CONFIG_MFD_MC13XXX)	+= mc13xxx-core.o
+obj-$(CONFIG_MFD_MC13XXX_SPI)	+= mc13xxx-spi.o
 
 obj-$(CONFIG_MFD_CORE)		+= mfd-core.o
 
diff --git a/drivers/mfd/mc13xxx-core.c b/drivers/mfd/mc13xxx-core.c
index d761a2e..13a32b7 100644
--- a/drivers/mfd/mc13xxx-core.c
+++ b/drivers/mfd/mc13xxx-core.c
@@ -15,36 +15,13 @@
 #include <linux/platform_device.h>
 #include <linux/mutex.h>
 #include <linux/interrupt.h>
-#include <linux/spi/spi.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/mc13xxx.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
-#include <linux/regmap.h>
-#include <linux/err.h>
 
-enum mc13xxx_id {
-	MC13XXX_ID_MC13783,
-	MC13XXX_ID_MC13892,
-	MC13XXX_ID_INVALID,
-};
-
-struct mc13xxx {
-	struct regmap *regmap;
-
-	struct device *dev;
-	enum mc13xxx_id ictype;
-
-	struct mutex lock;
-	int irq;
-	int flags;
-
-	irq_handler_t irqhandler[MC13XXX_NUM_IRQ];
-	void *irqdata[MC13XXX_NUM_IRQ];
-
-	int adcflags;
-};
+#include "mc13xxx.h"
 
 #define MC13XXX_IRQSTAT0	0
 #define MC13XXX_IRQSTAT0_ADCDONEI	(1 << 0)
@@ -151,8 +128,6 @@ struct mc13xxx {
 
 #define MC13XXX_ADC2		45
 
-#define MC13XXX_NUMREGS 0x3f
-
 void mc13xxx_lock(struct mc13xxx *mc13xxx)
 {
 	if (!mutex_trylock(&mc13xxx->lock)) {
@@ -674,90 +649,7 @@ static inline int mc13xxx_probe_flags_dt(struct mc13xxx *mc13xxx)
 }
 #endif
 
-static const struct spi_device_id mc13xxx_device_id[] = {
-	{
-		.name = "mc13783",
-		.driver_data = MC13XXX_ID_MC13783,
-	}, {
-		.name = "mc13892",
-		.driver_data = MC13XXX_ID_MC13892,
-	}, {
-		/* sentinel */
-	}
-};
-MODULE_DEVICE_TABLE(spi, mc13xxx_device_id);
-
-static const struct of_device_id mc13xxx_dt_ids[] = {
-	{ .compatible = "fsl,mc13783", .data = (void *) MC13XXX_ID_MC13783, },
-	{ .compatible = "fsl,mc13892", .data = (void *) MC13XXX_ID_MC13892, },
-	{ /* sentinel */ }
-};
-MODULE_DEVICE_TABLE(of, mc13xxx_dt_ids);
-
-static struct regmap_config mc13xxx_regmap_spi_config = {
-	.reg_bits = 7,
-	.pad_bits = 1,
-	.val_bits = 24,
-
-	.max_register = MC13XXX_NUMREGS,
-
-	.cache_type = REGCACHE_NONE,
-};
-
-static int mc13xxx_common_init(struct mc13xxx *mc13xxx,
-		struct mc13xxx_platform_data *pdata, int irq);
-
-static void mc13xxx_common_cleanup(struct mc13xxx *mc13xxx);
-
-static int mc13xxx_spi_probe(struct spi_device *spi)
-{
-	const struct of_device_id *of_id;
-	struct spi_driver *sdrv = to_spi_driver(spi->dev.driver);
-	struct mc13xxx *mc13xxx;
-	struct mc13xxx_platform_data *pdata = dev_get_platdata(&spi->dev);
-	int ret;
-
-	of_id = of_match_device(mc13xxx_dt_ids, &spi->dev);
-	if (of_id)
-		sdrv->id_table = &mc13xxx_device_id[(enum mc13xxx_id) of_id->data];
-
-	mc13xxx = kzalloc(sizeof(*mc13xxx), GFP_KERNEL);
-	if (!mc13xxx)
-		return -ENOMEM;
-
-	dev_set_drvdata(&spi->dev, mc13xxx);
-	spi->mode = SPI_MODE_0 | SPI_CS_HIGH;
-	spi->bits_per_word = 32;
-
-	mc13xxx->dev = &spi->dev;
-	mutex_init(&mc13xxx->lock);
-
-	mc13xxx->regmap = regmap_init_spi(spi, &mc13xxx_regmap_spi_config);
-	if (IS_ERR(mc13xxx->regmap)) {
-		ret = PTR_ERR(mc13xxx->regmap);
-		dev_err(mc13xxx->dev, "Failed to initialize register map: %d\n",
-				ret);
-		dev_set_drvdata(&spi->dev, NULL);
-		kfree(mc13xxx);
-		return ret;
-	}
-
-	ret = mc13xxx_common_init(mc13xxx, pdata, spi->irq);
-
-	if (ret) {
-		dev_set_drvdata(&spi->dev, NULL);
-	} else {
-		const struct spi_device_id *devid =
-			spi_get_device_id(spi);
-		if (!devid || devid->driver_data != mc13xxx->ictype)
-			dev_warn(mc13xxx->dev,
-				"device id doesn't match auto detection!\n");
-	}
-
-	return ret;
-}
-
-static int mc13xxx_common_init(struct mc13xxx *mc13xxx,
+int mc13xxx_common_init(struct mc13xxx *mc13xxx,
 		struct mc13xxx_platform_data *pdata, int irq)
 {
 	int ret;
@@ -823,17 +715,9 @@ err_revision:
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(mc13xxx_common_init);
 
-static int __devexit mc13xxx_spi_remove(struct spi_device *spi)
-{
-	struct mc13xxx *mc13xxx = dev_get_drvdata(&spi->dev);
-
-	mc13xxx_common_cleanup(mc13xxx);
-
-	return 0;
-}
-
-static void mc13xxx_common_cleanup(struct mc13xxx *mc13xxx)
+void mc13xxx_common_cleanup(struct mc13xxx *mc13xxx)
 {
 	free_irq(mc13xxx->irq, mc13xxx);
 
@@ -843,29 +727,7 @@ static void mc13xxx_common_cleanup(struct mc13xxx *mc13xxx)
 
 	kfree(mc13xxx);
 }
-
-static struct spi_driver mc13xxx_spi_driver = {
-	.id_table = mc13xxx_device_id,
-	.driver = {
-		.name = "mc13xxx",
-		.owner = THIS_MODULE,
-		.of_match_table = mc13xxx_dt_ids,
-	},
-	.probe = mc13xxx_spi_probe,
-	.remove = __devexit_p(mc13xxx_spi_remove),
-};
-
-static int __init mc13xxx_init(void)
-{
-	return spi_register_driver(&mc13xxx_spi_driver);
-}
-subsys_initcall(mc13xxx_init);
-
-static void __exit mc13xxx_exit(void)
-{
-	spi_unregister_driver(&mc13xxx_spi_driver);
-}
-module_exit(mc13xxx_exit);
+EXPORT_SYMBOL_GPL(mc13xxx_common_cleanup);
 
 MODULE_DESCRIPTION("Core driver for Freescale MC13XXX PMIC");
 MODULE_AUTHOR("Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>");
diff --git a/drivers/mfd/mc13xxx-spi.c b/drivers/mfd/mc13xxx-spi.c
new file mode 100644
index 0000000..3fcdab3
--- /dev/null
+++ b/drivers/mfd/mc13xxx-spi.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright 2009-2010 Pengutronix
+ * Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
+ *
+ * loosely based on an earlier driver that has
+ * Copyright 2009 Pengutronix, Sascha Hauer <s.hauer@pengutronix.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/mutex.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/mc13xxx.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_gpio.h>
+#include <linux/err.h>
+#include <linux/spi/spi.h>
+
+#include "mc13xxx.h"
+
+static const struct spi_device_id mc13xxx_device_id[] = {
+	{
+		.name = "mc13783",
+		.driver_data = MC13XXX_ID_MC13783,
+	}, {
+		.name = "mc13892",
+		.driver_data = MC13XXX_ID_MC13892,
+	}, {
+		/* sentinel */
+	}
+};
+MODULE_DEVICE_TABLE(spi, mc13xxx_device_id);
+
+static const struct of_device_id mc13xxx_dt_ids[] = {
+	{ .compatible = "fsl,mc13783", .data = (void *) MC13XXX_ID_MC13783, },
+	{ .compatible = "fsl,mc13892", .data = (void *) MC13XXX_ID_MC13892, },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, mc13xxx_dt_ids);
+
+static struct regmap_config mc13xxx_regmap_spi_config = {
+	.reg_bits = 7,
+	.pad_bits = 1,
+	.val_bits = 24,
+
+	.max_register = MC13XXX_NUMREGS,
+
+	.cache_type = REGCACHE_NONE,
+};
+
+static int mc13xxx_spi_probe(struct spi_device *spi)
+{
+	const struct of_device_id *of_id;
+	struct spi_driver *sdrv = to_spi_driver(spi->dev.driver);
+	struct mc13xxx *mc13xxx;
+	struct mc13xxx_platform_data *pdata = dev_get_platdata(&spi->dev);
+	int ret;
+
+	of_id = of_match_device(mc13xxx_dt_ids, &spi->dev);
+	if (of_id)
+		sdrv->id_table = &mc13xxx_device_id[(enum mc13xxx_id) of_id->data];
+
+	mc13xxx = kzalloc(sizeof(*mc13xxx), GFP_KERNEL);
+	if (!mc13xxx)
+		return -ENOMEM;
+
+	dev_set_drvdata(&spi->dev, mc13xxx);
+	spi->mode = SPI_MODE_0 | SPI_CS_HIGH;
+	spi->bits_per_word = 32;
+
+	mc13xxx->dev = &spi->dev;
+	mutex_init(&mc13xxx->lock);
+
+	mc13xxx->regmap = regmap_init_spi(spi, &mc13xxx_regmap_spi_config);
+	if (IS_ERR(mc13xxx->regmap)) {
+		ret = PTR_ERR(mc13xxx->regmap);
+		dev_err(mc13xxx->dev, "Failed to initialize register map: %d\n",
+				ret);
+		dev_set_drvdata(&spi->dev, NULL);
+		kfree(mc13xxx);
+		return ret;
+	}
+
+	ret = mc13xxx_common_init(mc13xxx, pdata, spi->irq);
+
+	if (ret) {
+		dev_set_drvdata(&spi->dev, NULL);
+	} else {
+		const struct spi_device_id *devid =
+			spi_get_device_id(spi);
+		if (!devid || devid->driver_data != mc13xxx->ictype)
+			dev_warn(mc13xxx->dev,
+				"device id doesn't match auto detection!\n");
+	}
+
+	return ret;
+}
+
+static int __devexit mc13xxx_spi_remove(struct spi_device *spi)
+{
+	struct mc13xxx *mc13xxx = dev_get_drvdata(&spi->dev);
+
+	mc13xxx_common_cleanup(mc13xxx);
+
+	return 0;
+}
+
+static struct spi_driver mc13xxx_spi_driver = {
+	.id_table = mc13xxx_device_id,
+	.driver = {
+		.name = "mc13xxx",
+		.owner = THIS_MODULE,
+		.of_match_table = mc13xxx_dt_ids,
+	},
+	.probe = mc13xxx_spi_probe,
+	.remove = __devexit_p(mc13xxx_spi_remove),
+};
+
+static int __init mc13xxx_init(void)
+{
+	return spi_register_driver(&mc13xxx_spi_driver);
+}
+subsys_initcall(mc13xxx_init);
+
+static void __exit mc13xxx_exit(void)
+{
+	spi_unregister_driver(&mc13xxx_spi_driver);
+}
+module_exit(mc13xxx_exit);
+
+MODULE_DESCRIPTION("Core driver for Freescale MC13XXX PMIC");
+MODULE_AUTHOR("Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mfd/mc13xxx.h b/drivers/mfd/mc13xxx.h
new file mode 100644
index 0000000..bbba06f
--- /dev/null
+++ b/drivers/mfd/mc13xxx.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2012 Creative Product Design
+ * Marc Reilly <marc@cpdesign.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation.
+ */
+#ifndef __DRIVERS_MFD_MC13XXX_H
+#define __DRIVERS_MFD_MC13XXX_H
+
+#include <linux/mutex.h>
+#include <linux/regmap.h>
+#include <linux/mfd/mc13xxx.h>
+
+enum mc13xxx_id {
+	MC13XXX_ID_MC13783,
+	MC13XXX_ID_MC13892,
+	MC13XXX_ID_INVALID,
+};
+
+#define MC13XXX_NUMREGS 0x3f
+
+struct mc13xxx {
+	struct regmap *regmap;
+
+	struct device *dev;
+	enum mc13xxx_id ictype;
+
+	struct mutex lock;
+	int irq;
+	int flags;
+
+	irq_handler_t irqhandler[MC13XXX_NUM_IRQ];
+	void *irqdata[MC13XXX_NUM_IRQ];
+
+	int adcflags;
+};
+
+int mc13xxx_common_init(struct mc13xxx *mc13xxx,
+		struct mc13xxx_platform_data *pdata, int irq);
+
+void mc13xxx_common_cleanup(struct mc13xxx *mc13xxx);
+
+#endif /* __DRIVERS_MFD_MC13XXX_H */
-- 
cgit v0.10.2


From df3df6469fd1e59284d6b5d4dd9dbe1bd7861040 Mon Sep 17 00:00:00 2001
From: Marc Reilly <marc@cpdesign.com.au>
Date: Sun, 1 Apr 2012 16:41:39 +1000
Subject: mfd: Add mc13xxx i2c driver

Adds support for mc13xxx family ICs connected via i2c.

Signed-off-by: Marc Reilly <marc@cpdesign.com.au>
Acked-by: Oskar Schirmer <oskar@scara.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 9c8294c..ef86a74 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -597,7 +597,7 @@ config MFD_MC13783
 
 config MFD_MC13XXX
 	tristate "Support Freescale MC13783 and MC13892"
-	depends on SPI_MASTER
+	depends on SPI_MASTER || I2C
 	select MFD_CORE
 	select MFD_MC13783
 	help
@@ -615,6 +615,13 @@ config MFD_MC13XXX_SPI
 	help
 	  Select this if your MC13xxx is connected via an SPI bus.
 
+config MFD_MC13XXX_I2C
+	tristate "MC13xxx I2C interface" if I2C
+	default I2C
+	select REGMAP_I2C
+	help
+	  Select this if your MC13xxx is connected via an I2C bus.
+
 endif
 
 config ABX500_CORE
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 7ce5b5c..5dd6be7 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_TWL6040_CORE)	+= twl6040-core.o twl6040-irq.o
 
 obj-$(CONFIG_MFD_MC13XXX)	+= mc13xxx-core.o
 obj-$(CONFIG_MFD_MC13XXX_SPI)	+= mc13xxx-spi.o
+obj-$(CONFIG_MFD_MC13XXX_I2C)	+= mc13xxx-i2c.o
 
 obj-$(CONFIG_MFD_CORE)		+= mfd-core.o
 
diff --git a/drivers/mfd/mc13xxx-i2c.c b/drivers/mfd/mc13xxx-i2c.c
new file mode 100644
index 0000000..d22501d
--- /dev/null
+++ b/drivers/mfd/mc13xxx-i2c.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2009-2010 Creative Product Design
+ * Marc Reilly marc@cpdesign.com.au
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/mutex.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/mc13xxx.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_gpio.h>
+#include <linux/i2c.h>
+#include <linux/err.h>
+
+#include "mc13xxx.h"
+
+static const struct i2c_device_id mc13xxx_i2c_device_id[] = {
+	{
+		.name = "mc13892",
+		.driver_data = MC13XXX_ID_MC13892,
+	}, {
+		/* sentinel */
+	}
+};
+MODULE_DEVICE_TABLE(i2c, mc13xxx_i2c_device_id);
+
+static const struct of_device_id mc13xxx_dt_ids[] = {
+	{
+		.compatible = "fsl,mc13892",
+		.data = (void *) &mc13xxx_i2c_device_id[0],
+	}, {
+		/* sentinel */
+	}
+};
+MODULE_DEVICE_TABLE(of, mc13xxx_dt_ids);
+
+static struct regmap_config mc13xxx_regmap_i2c_config = {
+	.reg_bits = 8,
+	.val_bits = 24,
+
+	.max_register = MC13XXX_NUMREGS,
+
+	.cache_type = REGCACHE_NONE,
+};
+
+static int mc13xxx_i2c_probe(struct i2c_client *client,
+		const struct i2c_device_id *id)
+{
+	const struct of_device_id *of_id;
+	struct i2c_driver *idrv = to_i2c_driver(client->dev.driver);
+	struct mc13xxx *mc13xxx;
+	struct mc13xxx_platform_data *pdata = dev_get_platdata(&client->dev);
+	int ret;
+
+	of_id = of_match_device(mc13xxx_dt_ids, &client->dev);
+	if (of_id)
+		idrv->id_table = (const struct i2c_device_id*) of_id->data;
+
+	mc13xxx = kzalloc(sizeof(*mc13xxx), GFP_KERNEL);
+	if (!mc13xxx)
+		return -ENOMEM;
+
+	dev_set_drvdata(&client->dev, mc13xxx);
+
+	mc13xxx->dev = &client->dev;
+	mutex_init(&mc13xxx->lock);
+
+	mc13xxx->regmap = regmap_init_i2c(client, &mc13xxx_regmap_i2c_config);
+	if (IS_ERR(mc13xxx->regmap)) {
+		ret = PTR_ERR(mc13xxx->regmap);
+		dev_err(mc13xxx->dev, "Failed to initialize register map: %d\n",
+				ret);
+		dev_set_drvdata(&client->dev, NULL);
+		kfree(mc13xxx);
+		return ret;
+	}
+
+	ret = mc13xxx_common_init(mc13xxx, pdata, client->irq);
+
+	if (ret == 0 && (id->driver_data != mc13xxx->ictype))
+		dev_warn(mc13xxx->dev,
+				"device id doesn't match auto detection!\n");
+
+	return ret;
+}
+
+static int __devexit mc13xxx_i2c_remove(struct i2c_client *client)
+{
+	struct mc13xxx *mc13xxx = dev_get_drvdata(&client->dev);
+
+	mc13xxx_common_cleanup(mc13xxx);
+
+	return 0;
+}
+
+static struct i2c_driver mc13xxx_i2c_driver = {
+	.id_table = mc13xxx_i2c_device_id,
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "mc13xxx",
+		.of_match_table = mc13xxx_dt_ids,
+	},
+	.probe = mc13xxx_i2c_probe,
+	.remove = __devexit_p(mc13xxx_i2c_remove),
+};
+
+static int __init mc13xxx_i2c_init(void)
+{
+	return i2c_add_driver(&mc13xxx_i2c_driver);
+}
+subsys_initcall(mc13xxx_i2c_init);
+
+static void __exit mc13xxx_i2c_exit(void)
+{
+	i2c_del_driver(&mc13xxx_i2c_driver);
+}
+module_exit(mc13xxx_i2c_exit);
+
+MODULE_DESCRIPTION("i2c driver for Freescale MC13XXX PMIC");
+MODULE_AUTHOR("Marc Reilly <marc@cpdesign.com.au");
+MODULE_LICENSE("GPL v2");
-- 
cgit v0.10.2


From 44f72e53382c9c673fd54c3bab67a6b9a2d4526e Mon Sep 17 00:00:00 2001
From: Virupax Sadashivpetimath <virupax.sadashivpetimath@stericsson.com>
Date: Tue, 17 Apr 2012 09:30:14 +0200
Subject: mfd: Add new resources on ab8500 AB8505 and AB9540

The AB8505 and AB9540 has extended support for micro USB
resistance detection, used for detecting chargers. Let's
register resources for this resource. Let's also split off the
separate codec device for AB9540.

Signed-off-by: Virupax Sadashivpetimath <virupax.sadashivpetimath@stericsson.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index 1f08704..ae67612 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -744,6 +744,39 @@ static struct resource __devinitdata ab8500_usb_resources[] = {
 	},
 };
 
+static struct resource __devinitdata ab8505_iddet_resources[] = {
+	{
+		.name  = "KeyDeglitch",
+		.start = AB8505_INT_KEYDEGLITCH,
+		.end   = AB8505_INT_KEYDEGLITCH,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name  = "KP",
+		.start = AB8505_INT_KP,
+		.end   = AB8505_INT_KP,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name  = "IKP",
+		.start = AB8505_INT_IKP,
+		.end   = AB8505_INT_IKP,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name  = "IKR",
+		.start = AB8505_INT_IKR,
+		.end   = AB8505_INT_IKR,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name  = "KeyStuck",
+		.start = AB8505_INT_KEYSTUCK,
+		.end   = AB8505_INT_KEYSTUCK,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
 static struct resource __devinitdata ab8500_temp_resources[] = {
 	{
 		.name  = "AB8500_TEMP_WARM",
@@ -803,10 +836,6 @@ static struct mfd_cell __devinitdata abx500_common_devs[] = {
 		.resources = ab8500_av_acc_detect_resources,
 	},
 	{
-		.name = "ab8500-codec",
-	},
-
-	{
 		.name = "ab8500-poweron-key",
 		.num_resources = ARRAY_SIZE(ab8500_poweronkey_db_resources),
 		.resources = ab8500_poweronkey_db_resources,
@@ -845,6 +874,9 @@ static struct mfd_cell __devinitdata ab8500_devs[] = {
 		.num_resources = ARRAY_SIZE(ab8500_usb_resources),
 		.resources = ab8500_usb_resources,
 	},
+	{
+		.name = "ab8500-codec",
+	},
 };
 
 static struct mfd_cell __devinitdata ab9540_devs[] = {
@@ -858,6 +890,18 @@ static struct mfd_cell __devinitdata ab9540_devs[] = {
 		.num_resources = ARRAY_SIZE(ab8500_usb_resources),
 		.resources = ab8500_usb_resources,
 	},
+	{
+		.name = "ab9540-codec",
+	},
+};
+
+/* Device list common to ab9540 and ab8505 */
+static struct mfd_cell __devinitdata ab9540_ab8505_devs[] = {
+	{
+		.name = "ab-iddet",
+		.num_resources = ARRAY_SIZE(ab8505_iddet_resources),
+		.resources = ab8505_iddet_resources,
+	},
 };
 
 static ssize_t show_chip_id(struct device *dev,
@@ -1125,8 +1169,14 @@ int __devinit ab8500_init(struct ab8500 *ab8500, enum ab8500_version version)
 			      ab8500->irq_base);
 	else
 		ret = mfd_add_devices(ab8500->dev, 0, ab8500_devs,
-			      ARRAY_SIZE(ab9540_devs), NULL,
+			      ARRAY_SIZE(ab8500_devs), NULL,
+			      ab8500->irq_base);
+
+	if (is_ab9540(ab8500) || is_ab8505(ab8500))
+		ret = mfd_add_devices(ab8500->dev, 0, ab9540_ab8505_devs,
+			      ARRAY_SIZE(ab9540_ab8505_devs), NULL,
 			      ab8500->irq_base);
+
 	if (ret)
 		goto out_freeirq;
 
diff --git a/include/linux/mfd/abx500/ab8500.h b/include/linux/mfd/abx500/ab8500.h
index fccc300..d798f5b 100644
--- a/include/linux/mfd/abx500/ab8500.h
+++ b/include/linux/mfd/abx500/ab8500.h
@@ -194,6 +194,14 @@ enum ab8500_version {
 #define AB9540_INT_GPIO52F		123
 #define AB9540_INT_GPIO53F		124
 #define AB9540_INT_GPIO54F		125 /* not 8505 */
+/* ab8500_irq_regoffset[16] -> IT[Source|Latch|Mask]25 */
+#define AB8505_INT_KEYSTUCK		128
+#define AB8505_INT_IKR			129
+#define AB8505_INT_IKP			130
+#define AB8505_INT_KP			131
+#define AB8505_INT_KEYDEGLITCH		132
+#define AB8505_INT_MODPWRSTATUSF	134
+#define AB8505_INT_MODPWRSTATUSR	135
 
 /*
  * AB8500_AB9540_NR_IRQS is used when configuring the IRQ numbers for the
@@ -203,8 +211,8 @@ enum ab8500_version {
  * which is larger.
  */
 #define AB8500_NR_IRQS			112
-#define AB8505_NR_IRQS			128
-#define AB9540_NR_IRQS			128
+#define AB8505_NR_IRQS			136
+#define AB9540_NR_IRQS			136
 /* This is set to the roof of any AB8500 chip variant IRQ counts */
 #define AB8500_MAX_NR_IRQS		AB9540_NR_IRQS
 
-- 
cgit v0.10.2


From 7e82d6ff5d2c5e35d1fcb8c673287f7d780a13bb Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin.vincent@stericsson.com>
Date: Tue, 17 Apr 2012 09:30:24 +0200
Subject: mfd: Handle the ab8500 irq for suspend/resume

Ensure that the AB interrupt is only handled at a time when
all core drivers are resumed. Ensure that the AB interrupt
is marked as a wakeup interrupt.

Signed-off-by: Rabin Vincent <rabin.vincent@stericsson.com>
Reviewed-by: Jonas Aberg <jonas.aberg@stericsson.com>
Reviewed-by: Mattias Wallin <mattias.wallin@stericsson.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/ab8500-i2c.c b/drivers/mfd/ab8500-i2c.c
index b83045f..89b31a3 100644
--- a/drivers/mfd/ab8500-i2c.c
+++ b/drivers/mfd/ab8500-i2c.c
@@ -101,10 +101,42 @@ static const struct platform_device_id ab8500_id[] = {
 	{ }
 };
 
+#ifdef CONFIG_PM
+static int ab8500_i2c_suspend(struct device *dev)
+{
+	struct ab8500 *ab = dev_get_drvdata(dev);
+
+	disable_irq(ab->irq);
+	enable_irq_wake(ab->irq);
+
+	return 0;
+}
+
+static int ab8500_i2c_resume(struct device *dev)
+{
+	struct ab8500 *ab = dev_get_drvdata(dev);
+
+	disable_irq_wake(ab->irq);
+	enable_irq(ab->irq);
+
+	return 0;
+}
+
+static const struct dev_pm_ops ab8500_i2c_pm_ops = {
+	.suspend	= ab8500_i2c_suspend,
+	.resume		= ab8500_i2c_resume,
+};
+
+#define AB8500_I2C_PM_OPS	(&ab8500_i2c_pm_ops)
+#else
+#define AB8500_I2C_PM_OPS	NULL
+#endif
+
 static struct platform_driver ab8500_i2c_driver = {
 	.driver = {
 		.name = "ab8500-i2c",
 		.owner = THIS_MODULE,
+		.pm	= AB8500_I2C_PM_OPS,
 	},
 	.probe	= ab8500_i2c_probe,
 	.remove	= __devexit_p(ab8500_i2c_remove),
-- 
cgit v0.10.2


From 112a80d29b529d4057777ac2cb4ec15ff5b6d210 Mon Sep 17 00:00:00 2001
From: Jonas Aaberg <jonas.aberg@stericsson.com>
Date: Tue, 17 Apr 2012 09:30:33 +0200
Subject: mfd: Deny ab8500 suspend if i2c transfer is ongoing

If we are in the middle of an I2C transfer we need to deny suspend
of the AB8500 core. Implement an atomic reference counter for the
I2C operations to make sure we don't do this.

Signed-off-by: Jonas Aaberg <jonas.aberg@stericsson.com>
Reviewed-by: Mattias Wallin <mattias.wallin@stericsson.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index ae67612..eee9560 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -161,9 +161,13 @@ static int set_register_interruptible(struct ab8500 *ab8500, u8 bank,
 static int ab8500_set_register(struct device *dev, u8 bank,
 	u8 reg, u8 value)
 {
+	int ret;
 	struct ab8500 *ab8500 = dev_get_drvdata(dev->parent);
 
-	return set_register_interruptible(ab8500, bank, reg, value);
+	atomic_inc(&ab8500->transfer_ongoing);
+	ret = set_register_interruptible(ab8500, bank, reg, value);
+	atomic_dec(&ab8500->transfer_ongoing);
+	return ret;
 }
 
 static int get_register_interruptible(struct ab8500 *ab8500, u8 bank,
@@ -192,9 +196,13 @@ static int get_register_interruptible(struct ab8500 *ab8500, u8 bank,
 static int ab8500_get_register(struct device *dev, u8 bank,
 	u8 reg, u8 *value)
 {
+	int ret;
 	struct ab8500 *ab8500 = dev_get_drvdata(dev->parent);
 
-	return get_register_interruptible(ab8500, bank, reg, value);
+	atomic_inc(&ab8500->transfer_ongoing);
+	ret = get_register_interruptible(ab8500, bank, reg, value);
+	atomic_dec(&ab8500->transfer_ongoing);
+	return ret;
 }
 
 static int mask_and_set_register_interruptible(struct ab8500 *ab8500, u8 bank,
@@ -241,11 +249,14 @@ out:
 static int ab8500_mask_and_set_register(struct device *dev,
 	u8 bank, u8 reg, u8 bitmask, u8 bitvalues)
 {
+	int ret;
 	struct ab8500 *ab8500 = dev_get_drvdata(dev->parent);
 
-	return mask_and_set_register_interruptible(ab8500, bank, reg,
-		bitmask, bitvalues);
-
+	atomic_inc(&ab8500->transfer_ongoing);
+	ret= mask_and_set_register_interruptible(ab8500, bank, reg,
+						 bitmask, bitvalues);
+	atomic_dec(&ab8500->transfer_ongoing);
+	return ret;
 }
 
 static struct abx500_ops ab8500_ops = {
@@ -264,6 +275,7 @@ static void ab8500_irq_lock(struct irq_data *data)
 	struct ab8500 *ab8500 = irq_data_get_irq_chip_data(data);
 
 	mutex_lock(&ab8500->irq_lock);
+	atomic_inc(&ab8500->transfer_ongoing);
 }
 
 static void ab8500_irq_sync_unlock(struct irq_data *data)
@@ -292,7 +304,7 @@ static void ab8500_irq_sync_unlock(struct irq_data *data)
 		reg = AB8500_IT_MASK1_REG + ab8500->irq_reg_offset[i];
 		set_register_interruptible(ab8500, AB8500_INTERRUPT, reg, new);
 	}
-
+	atomic_dec(&ab8500->transfer_ongoing);
 	mutex_unlock(&ab8500->irq_lock);
 }
 
@@ -332,6 +344,8 @@ static irqreturn_t ab8500_irq(int irq, void *dev)
 
 	dev_vdbg(ab8500->dev, "interrupt\n");
 
+	atomic_inc(&ab8500->transfer_ongoing);
+
 	for (i = 0; i < ab8500->mask_size; i++) {
 		int regoffset = ab8500->irq_reg_offset[i];
 		int status;
@@ -355,9 +369,10 @@ static irqreturn_t ab8500_irq(int irq, void *dev)
 
 			handle_nested_irq(ab8500->irq_base + line);
 			value &= ~(1 << bit);
+
 		} while (value);
 	}
-
+	atomic_dec(&ab8500->transfer_ongoing);
 	return IRQ_HANDLED;
 }
 
@@ -411,6 +426,14 @@ static void ab8500_irq_remove(struct ab8500 *ab8500)
 	}
 }
 
+int ab8500_suspend(struct ab8500 *ab8500)
+{
+	if (atomic_read(&ab8500->transfer_ongoing))
+		return -EINVAL;
+	else
+		return 0;
+}
+
 /* AB8500 GPIO Resources */
 static struct resource __devinitdata ab8500_gpio_resources[] = {
 	{
@@ -1059,6 +1082,7 @@ int __devinit ab8500_init(struct ab8500 *ab8500, enum ab8500_version version)
 
 	mutex_init(&ab8500->lock);
 	mutex_init(&ab8500->irq_lock);
+	atomic_set(&ab8500->transfer_ongoing, 0);
 
 	if (version != AB8500_VERSION_UNDEFINED)
 		ab8500->version = version;
diff --git a/include/linux/mfd/abx500/ab8500.h b/include/linux/mfd/abx500/ab8500.h
index d798f5b..91dd3ef 100644
--- a/include/linux/mfd/abx500/ab8500.h
+++ b/include/linux/mfd/abx500/ab8500.h
@@ -7,6 +7,7 @@
 #ifndef MFD_AB8500_H
 #define MFD_AB8500_H
 
+#include <linux/atomic.h>
 #include <linux/mutex.h>
 
 struct device;
@@ -224,6 +225,7 @@ enum ab8500_version {
  * @dev: parent device
  * @lock: read/write operations lock
  * @irq_lock: genirq bus lock
+ * @transfer_ongoing: 0 if no transfer ongoing
  * @irq: irq line
  * @version: chip version id (e.g. ab8500 or ab9540)
  * @chip_id: chip revision id
@@ -242,7 +244,7 @@ struct ab8500 {
 	struct device	*dev;
 	struct mutex	lock;
 	struct mutex	irq_lock;
-
+	atomic_t	transfer_ongoing;
 	int		irq_base;
 	int		irq;
 	enum ab8500_version version;
@@ -288,6 +290,8 @@ extern int __devinit ab8500_init(struct ab8500 *ab8500,
 				 enum ab8500_version version);
 extern int __devexit ab8500_exit(struct ab8500 *ab8500);
 
+extern int ab8500_suspend(struct ab8500 *ab8500);
+
 static inline int is_ab8500(struct ab8500 *ab)
 {
 	return ab->version == AB8500_VERSION_AB8500;
-- 
cgit v0.10.2


From 6ef9418c9e6fc41e6c1066b6423f3a1625e4b822 Mon Sep 17 00:00:00 2001
From: Rickard Andersson <rickard.andersson@stericsson.com>
Date: Tue, 17 Apr 2012 09:30:57 +0200
Subject: mfd: Add parameter to disable ab8500 battery management

This patch makes it possible to disable battery management
via a module boot parameter. When 'ab8500-core.no_bm=1' then
ab8500_btemp, ab8500_chargalg, ab8500_charger and ab8500_fg will
not be probed. This boot parameter is used for scripted testing
of the system.

Signed-off-by: Rickard Andersson <rickard.andersson@stericsson.com>
Reviewed-by: Jonas Aberg <jonas.aberg@stericsson.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index eee9560..08850f0 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -97,6 +97,9 @@
 
 #define AB8500_TURN_ON_STATUS		0x00
 
+static bool no_bm; /* No battery management */
+module_param(no_bm, bool, S_IRUGO);
+
 #define AB9540_MODEM_CTRL2_REG			0x23
 #define AB9540_MODEM_CTRL2_SWDBBRSTN_BIT	BIT(2)
 
@@ -834,26 +837,6 @@ static struct mfd_cell __devinitdata abx500_common_devs[] = {
 		.resources = ab8500_rtc_resources,
 	},
 	{
-		.name = "ab8500-charger",
-		.num_resources = ARRAY_SIZE(ab8500_charger_resources),
-		.resources = ab8500_charger_resources,
-	},
-	{
-		.name = "ab8500-btemp",
-		.num_resources = ARRAY_SIZE(ab8500_btemp_resources),
-		.resources = ab8500_btemp_resources,
-	},
-	{
-		.name = "ab8500-fg",
-		.num_resources = ARRAY_SIZE(ab8500_fg_resources),
-		.resources = ab8500_fg_resources,
-	},
-	{
-		.name = "ab8500-chargalg",
-		.num_resources = ARRAY_SIZE(ab8500_chargalg_resources),
-		.resources = ab8500_chargalg_resources,
-	},
-	{
 		.name = "ab8500-acc-det",
 		.num_resources = ARRAY_SIZE(ab8500_av_acc_detect_resources),
 		.resources = ab8500_av_acc_detect_resources,
@@ -886,6 +869,29 @@ static struct mfd_cell __devinitdata abx500_common_devs[] = {
 	},
 };
 
+static struct mfd_cell __devinitdata ab8500_bm_devs[] = {
+	{
+		.name = "ab8500-charger",
+		.num_resources = ARRAY_SIZE(ab8500_charger_resources),
+		.resources = ab8500_charger_resources,
+	},
+	{
+		.name = "ab8500-btemp",
+		.num_resources = ARRAY_SIZE(ab8500_btemp_resources),
+		.resources = ab8500_btemp_resources,
+	},
+	{
+		.name = "ab8500-fg",
+		.num_resources = ARRAY_SIZE(ab8500_fg_resources),
+		.resources = ab8500_fg_resources,
+	},
+	{
+		.name = "ab8500-chargalg",
+		.num_resources = ARRAY_SIZE(ab8500_chargalg_resources),
+		.resources = ab8500_chargalg_resources,
+	},
+};
+
 static struct mfd_cell __devinitdata ab8500_devs[] = {
 	{
 		.name = "ab8500-gpio",
@@ -1204,6 +1210,15 @@ int __devinit ab8500_init(struct ab8500 *ab8500, enum ab8500_version version)
 	if (ret)
 		goto out_freeirq;
 
+	if (!no_bm) {
+		/* Add battery management devices */
+		ret = mfd_add_devices(ab8500->dev, 0, ab8500_bm_devs,
+				      ARRAY_SIZE(ab8500_bm_devs), NULL,
+				      ab8500->irq_base);
+		if (ret)
+			dev_err(ab8500->dev, "error adding bm devices\n");
+	}
+
 	if (is_ab9540(ab8500))
 		ret = sysfs_create_group(&ab8500->dev->kobj,
 					&ab9540_attr_group);
-- 
cgit v0.10.2


From 9146ab5055152bbacb5690c384df2fd610fb3c68 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 1 May 2012 11:21:43 -0400
Subject: NFS: Read cleanups

Remove unused variables, and reformat some code.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 35e2dce..20a0293 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -341,8 +341,6 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc,
 	struct nfs_read_data *data;
 	size_t rsize = desc->pg_bsize, nbytes;
 	unsigned int offset;
-	int requests = 0;
-	int ret = 0;
 
 	nfs_list_remove_request(req);
 	nfs_list_add_request(req, &hdr->pages);
@@ -358,12 +356,11 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc,
 		data->pages.pagevec[0] = page;
 		nfs_read_rpcsetup(data, len, offset);
 		list_add(&data->list, &hdr->rpc_list);
-		requests++;
 		nbytes -= len;
 		offset += len;
-	} while(nbytes != 0);
+	} while (nbytes != 0);
 	desc->pg_rpc_callops = &nfs_read_common_ops;
-	return ret;
+	return 0;
 out_bad:
 	while (!list_empty(&hdr->rpc_list)) {
 		data = list_first_entry(&hdr->rpc_list, struct nfs_read_data, list);
@@ -387,8 +384,7 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc,
 							  desc->pg_count));
 	if (!data) {
 		desc->pg_completion_ops->error_cleanup(head);
-		ret = -ENOMEM;
-		goto out;
+		return -ENOMEM;
 	}
 
 	pages = data->pages.pagevec;
@@ -402,8 +398,7 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc,
 	nfs_read_rpcsetup(data, desc->pg_count, 0);
 	list_add(&data->list, &hdr->rpc_list);
 	desc->pg_rpc_callops = &nfs_read_common_ops;
-out:
-	return ret;
+	return 0;
 }
 
 int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
-- 
cgit v0.10.2


From 25b11dcdbfcad69a5ec03265e2dce19e5eca936b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 1 May 2012 12:07:22 -0400
Subject: NFS: Clean up nfs read and write error paths

Move the error handling for nfs_generic_pagein() into a single function.
Ditto for nfs_generic_flush().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 39cbac5..6fdeca2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1321,7 +1321,6 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 	if (ret != 0) {
 		put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
-		set_bit(NFS_IOHDR_REDO, &hdr->flags);
 	} else
 		pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
 	if (atomic_dec_and_test(&hdr->refcnt))
@@ -1476,7 +1475,6 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 	if (ret != 0) {
 		put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
-		set_bit(NFS_IOHDR_REDO, &hdr->flags);
 	} else
 		pnfs_do_multiple_reads(desc, &hdr->rpc_list);
 	if (atomic_dec_and_test(&hdr->refcnt))
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 20a0293..1961a19 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -320,6 +320,19 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
 	.completion = nfs_read_completion,
 };
 
+static void nfs_pagein_error(struct nfs_pageio_descriptor *desc,
+		struct nfs_pgio_header *hdr)
+{
+	set_bit(NFS_IOHDR_REDO, &hdr->flags);
+	while (!list_empty(&hdr->rpc_list)) {
+		struct nfs_read_data *data = list_first_entry(&hdr->rpc_list,
+				struct nfs_read_data, list);
+		list_del(&data->list);
+		nfs_readdata_release(data);
+	}
+	desc->pg_completion_ops->error_cleanup(&desc->pg_list);
+}
+
 /*
  * Generate multiple requests to fill a single page.
  *
@@ -342,33 +355,27 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc,
 	size_t rsize = desc->pg_bsize, nbytes;
 	unsigned int offset;
 
-	nfs_list_remove_request(req);
-	nfs_list_add_request(req, &hdr->pages);
-
 	offset = 0;
 	nbytes = desc->pg_count;
 	do {
 		size_t len = min(nbytes,rsize);
 
 		data = nfs_readdata_alloc(hdr, 1);
-		if (!data)
-			goto out_bad;
+		if (!data) {
+			nfs_pagein_error(desc, hdr);
+			return -ENOMEM;
+		}
 		data->pages.pagevec[0] = page;
 		nfs_read_rpcsetup(data, len, offset);
 		list_add(&data->list, &hdr->rpc_list);
 		nbytes -= len;
 		offset += len;
 	} while (nbytes != 0);
+
+	nfs_list_remove_request(req);
+	nfs_list_add_request(req, &hdr->pages);
 	desc->pg_rpc_callops = &nfs_read_common_ops;
 	return 0;
-out_bad:
-	while (!list_empty(&hdr->rpc_list)) {
-		data = list_first_entry(&hdr->rpc_list, struct nfs_read_data, list);
-		list_del(&data->list);
-		nfs_readdata_release(data);
-	}
-	desc->pg_completion_ops->error_cleanup(&hdr->pages);
-	return -ENOMEM;
 }
 
 static int nfs_pagein_one(struct nfs_pageio_descriptor *desc,
@@ -378,12 +385,11 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc,
 	struct page		**pages;
 	struct nfs_read_data    *data;
 	struct list_head *head = &desc->pg_list;
-	int ret = 0;
 
 	data = nfs_readdata_alloc(hdr, nfs_page_array_len(desc->pg_base,
 							  desc->pg_count));
 	if (!data) {
-		desc->pg_completion_ops->error_cleanup(head);
+		nfs_pagein_error(desc, hdr);
 		return -ENOMEM;
 	}
 
@@ -427,8 +433,6 @@ static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 	if (ret == 0)
 		ret = nfs_do_multiple_reads(&hdr->rpc_list,
 					    desc->pg_rpc_callops);
-	else
-		set_bit(NFS_IOHDR_REDO, &hdr->flags);
 	if (atomic_dec_and_test(&hdr->refcnt))
 		hdr->completion_ops->completion(hdr);
 	return ret;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2f80aa5..d1e4f81 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1058,6 +1058,19 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
 	.completion = nfs_write_completion,
 };
 
+static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
+		struct nfs_pgio_header *hdr)
+{
+	set_bit(NFS_IOHDR_REDO, &hdr->flags);
+	while (!list_empty(&hdr->rpc_list)) {
+		struct nfs_write_data *data = list_first_entry(&hdr->rpc_list,
+				struct nfs_write_data, list);
+		list_del(&data->list);
+		nfs_writedata_release(data);
+	}
+	desc->pg_completion_ops->error_cleanup(&desc->pg_list);
+}
+
 /*
  * Generate multiple small requests to write out a single
  * contiguous dirty area on one page.
@@ -1071,12 +1084,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
 	size_t wsize = desc->pg_bsize, nbytes;
 	unsigned int offset;
 	int requests = 0;
-	int ret = 0;
 	struct nfs_commit_info cinfo;
 
 	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
-	nfs_list_remove_request(req);
-	nfs_list_add_request(req, &hdr->pages);
 
 	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
 	    (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) ||
@@ -1090,8 +1100,10 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
 		size_t len = min(nbytes, wsize);
 
 		data = nfs_writedata_alloc(hdr, 1);
-		if (!data)
-			goto out_bad;
+		if (!data) {
+			nfs_flush_error(desc, hdr);
+			return -ENOMEM;
+		}
 		data->pages.pagevec[0] = page;
 		nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo);
 		list_add(&data->list, &hdr->rpc_list);
@@ -1099,17 +1111,10 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
 		nbytes -= len;
 		offset += len;
 	} while (nbytes != 0);
+	nfs_list_remove_request(req);
+	nfs_list_add_request(req, &hdr->pages);
 	desc->pg_rpc_callops = &nfs_write_common_ops;
-	return ret;
-
-out_bad:
-	while (!list_empty(&hdr->rpc_list)) {
-		data = list_first_entry(&hdr->rpc_list, struct nfs_write_data, list);
-		list_del(&data->list);
-		nfs_writedata_release(data);
-	}
-	desc->pg_completion_ops->error_cleanup(&hdr->pages);
-	return -ENOMEM;
+	return 0;
 }
 
 /*
@@ -1127,15 +1132,13 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
 	struct page		**pages;
 	struct nfs_write_data	*data;
 	struct list_head *head = &desc->pg_list;
-	int ret = 0;
 	struct nfs_commit_info cinfo;
 
 	data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base,
 							   desc->pg_count));
 	if (!data) {
-		desc->pg_completion_ops->error_cleanup(head);
-		ret = -ENOMEM;
-		goto out;
+		nfs_flush_error(desc, hdr);
+		return -ENOMEM;
 	}
 
 	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
@@ -1155,8 +1158,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
 	nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
 	list_add(&data->list, &hdr->rpc_list);
 	desc->pg_rpc_callops = &nfs_write_common_ops;
-out:
-	return ret;
+	return 0;
 }
 
 int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
@@ -1186,8 +1188,6 @@ static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 		ret = nfs_do_multiple_writes(&hdr->rpc_list,
 					     desc->pg_rpc_callops,
 					     desc->pg_ioflags);
-	else
-		set_bit(NFS_IOHDR_REDO, &hdr->flags);
 	if (atomic_dec_and_test(&hdr->refcnt))
 		hdr->completion_ops->completion(hdr);
 	return ret;
-- 
cgit v0.10.2


From 4bd8b010136afa0df9122a08bad361686bda0a1d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 1 May 2012 12:49:58 -0400
Subject: NFS: Simplify the nfs_read_completion functions

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f17e469..aab3016 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -243,36 +243,28 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 		dreq->count += hdr->good_bytes;
 	spin_unlock(&dreq->lock);
 
-	if (!test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
-		while (!list_empty(&hdr->pages)) {
-			struct nfs_page *req = nfs_list_entry(hdr->pages.next);
-			struct page *page = req->wb_page;
-
-			if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
-				if (bytes > hdr->good_bytes)
-					zero_user(page, 0, PAGE_SIZE);
-				else if (hdr->good_bytes - bytes < PAGE_SIZE)
-					zero_user_segment(page,
-						hdr->good_bytes & ~PAGE_MASK,
-						PAGE_SIZE);
-			}
-			bytes += req->wb_bytes;
-			nfs_list_remove_request(req);
-			if (!PageCompound(page))
-				set_page_dirty(page);
-			nfs_direct_readpage_release(req);
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+		struct page *page = req->wb_page;
+
+		if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
+			if (bytes > hdr->good_bytes)
+				zero_user(page, 0, PAGE_SIZE);
+			else if (hdr->good_bytes - bytes < PAGE_SIZE)
+				zero_user_segment(page,
+					hdr->good_bytes & ~PAGE_MASK,
+					PAGE_SIZE);
 		}
-	} else {
-		while (!list_empty(&hdr->pages)) {
-			struct nfs_page *req = nfs_list_entry(hdr->pages.next);
-
-			if (bytes < hdr->good_bytes)
-				if (!PageCompound(req->wb_page))
-					set_page_dirty(req->wb_page);
-			bytes += req->wb_bytes;
-			nfs_list_remove_request(req);
-			nfs_direct_readpage_release(req);
+		if (!PageCompound(page)) {
+			if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
+				if (bytes < hdr->good_bytes)
+					set_page_dirty(page);
+			} else
+				set_page_dirty(page);
 		}
+		bytes += req->wb_bytes;
+		nfs_list_remove_request(req);
+		nfs_direct_readpage_release(req);
 	}
 out_put:
 	if (put_dreq(dreq))
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 1961a19..37c9eb2 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -179,34 +179,26 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
 
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
 		goto out;
-	if (!test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
-		while (!list_empty(&hdr->pages)) {
-			struct nfs_page *req = nfs_list_entry(hdr->pages.next);
-			struct page *page = req->wb_page;
-
-			if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
-				if (bytes > hdr->good_bytes)
-					zero_user(page, 0, PAGE_SIZE);
-				else if (hdr->good_bytes - bytes < PAGE_SIZE)
-					zero_user_segment(page,
-						hdr->good_bytes & ~PAGE_MASK,
-						PAGE_SIZE);
-			}
-			SetPageUptodate(page);
-			nfs_list_remove_request(req);
-			nfs_readpage_release(req);
-			bytes += PAGE_SIZE;
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+		struct page *page = req->wb_page;
+
+		if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
+			if (bytes > hdr->good_bytes)
+				zero_user(page, 0, PAGE_SIZE);
+			else if (hdr->good_bytes - bytes < PAGE_SIZE)
+				zero_user_segment(page,
+					hdr->good_bytes & ~PAGE_MASK,
+					PAGE_SIZE);
 		}
-	} else {
-		while (!list_empty(&hdr->pages)) {
-			struct nfs_page *req = nfs_list_entry(hdr->pages.next);
-
-			bytes += req->wb_bytes;
+		bytes += req->wb_bytes;
+		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
 			if (bytes <= hdr->good_bytes)
-				SetPageUptodate(req->wb_page);
-			nfs_list_remove_request(req);
-			nfs_readpage_release(req);
-		}
+				SetPageUptodate(page);
+		} else
+			SetPageUptodate(page);
+		nfs_list_remove_request(req);
+		nfs_readpage_release(req);
 	}
 out:
 	hdr->release(hdr);
-- 
cgit v0.10.2


From a9f6991b6cd3f55aa8482633337cd811d84d0dd8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Apr 2012 13:48:17 -0400
Subject: NFSv4: Fix a typo in NFS4_enc_link_sz

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4c3cc0e..fe61424 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -626,9 +626,9 @@ static int nfs4_stat_to_errno(int);
 				encode_savefh_maxsz + \
 				encode_putfh_maxsz + \
 				encode_link_maxsz + \
-				decode_getattr_maxsz + \
+				encode_getattr_maxsz + \
 				encode_restorefh_maxsz + \
-				decode_getattr_maxsz)
+				encode_getattr_maxsz)
 #define NFS4_dec_link_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
-- 
cgit v0.10.2


From 9e907fec6ef7705ba07e22f034dacf102d29a538 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Apr 2012 13:48:17 -0400
Subject: NFSv4: Delegreturn only needs the cache consistency bitmask

In order to do close-to-open cache consistency checking after
a delegreturn, we don't need to retrieve the full set of
attributes.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1780391..111a3cc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4117,7 +4117,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
 	data->args.fhandle = &data->fh;
 	data->args.stateid = &data->stateid;
-	data->args.bitmask = server->attr_bitmask;
+	data->args.bitmask = server->cache_consistency_bitmask;
 	nfs_copy_fh(&data->fh, NFS_FH(inode));
 	nfs4_stateid_copy(&data->stateid, stateid);
 	data->res.fattr = &data->fattr;
-- 
cgit v0.10.2


From e144cbcc251f16c1a14b9256cda73ab4aebe933a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 28 Apr 2012 16:05:03 -0400
Subject: NFSv4: Retrieve attributes _before_ calling delegreturn

In order to retrieve cache consistency attributes before
anyone else has a chance to change the inode, we need to
put the GETATTR op _before_ the DELEGRETURN op.

We can then use that as part of a 'nfs_post_op_update_inode_force_wcc()'
call, to ensure that we update the attributes without clearing our
cached data.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 111a3cc..2e0fbff 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4138,9 +4138,10 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	if (status != 0)
 		goto out;
 	status = data->rpc_status;
-	if (status != 0)
-		goto out;
-	nfs_refresh_inode(inode, &data->fattr);
+	if (status == 0)
+		nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+	else
+		nfs_refresh_inode(inode, &data->fattr);
 out:
 	rpc_put_task(task);
 	return status;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index fe61424..ac7a3b0 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2602,8 +2602,8 @@ static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fhandle, &hdr);
-	encode_delegreturn(xdr, args->stateid, &hdr);
 	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_delegreturn(xdr, args->stateid, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -6527,10 +6527,10 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
 	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_delegreturn(xdr);
+	status = decode_getfattr(xdr, res->fattr, res->server);
 	if (status != 0)
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server);
+	status = decode_delegreturn(xdr);
 out:
 	return status;
 }
-- 
cgit v0.10.2


From b4b1eadf7c5f00636500ad47f68edc0666e63ea5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 29 Apr 2012 11:23:50 -0400
Subject: NFS: Don't force page cache revalidations when holding a delegation

If we're holding a delegation, then we already know that our
page cache is valid.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 59a12c6a..fed27c0 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -870,6 +870,15 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 	return 0;
 }
 
+static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
+{
+	if (nfs_have_delegated_attributes(inode))
+		return false;
+	return (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE)
+		|| nfs_attribute_timeout(inode)
+		|| NFS_STALE(inode);
+}
+
 /**
  * nfs_revalidate_mapping - Revalidate the pagecache
  * @inode - pointer to host inode
@@ -880,9 +889,7 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int ret = 0;
 
-	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
-			|| nfs_attribute_cache_expired(inode)
-			|| NFS_STALE(inode)) {
+	if (nfs_mapping_need_revalidate_inode(inode)) {
 		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 		if (ret < 0)
 			goto out;
-- 
cgit v0.10.2


From 01da47bde78ff2149f6546a0f17e25983aaddd7b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 29 Apr 2012 12:30:19 -0400
Subject: NFS: Optimise away nfs_check_inode_attributes() when holding a
 delegation

We already know that the attribute cache is valid.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index fed27c0..81946e7 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -955,6 +955,8 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	unsigned long invalid = 0;
 
 
+	if (nfs_have_delegated_attributes(inode))
+		return 0;
 	/* Has the inode gone and changed behind our back? */
 	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
 		return -EIO;
-- 
cgit v0.10.2


From 8d197a568fc337c66729b289c7fa0f28c14ba5ac Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 29 Apr 2012 12:50:01 -0400
Subject: NFS: Always trust the PageUptodate flag when we have a delegation

We can always use the optimal full page write if we know that we
hold a delegation.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d1e4f81..6f263da 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -850,10 +850,14 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
  * the PageUptodate() flag. In this case, we will need to turn off
  * write optimisations that depend on the page contents being correct.
  */
-static int nfs_write_pageuptodate(struct page *page, struct inode *inode)
+static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
 {
-	return PageUptodate(page) &&
-		!(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA));
+	if (nfs_have_delegated_attributes(inode))
+		goto out;
+	if (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE)
+		return false;
+out:
+	return PageUptodate(page) != 0;
 }
 
 /*
-- 
cgit v0.10.2


From 4124bbc52118e7da6f7ad41cc247fa16f4b3f051 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Apr 2012 13:48:17 -0400
Subject: NFS: Simplify nfs_fhget()

If the inode is being initialised, there is no point in
setting flags such as NFS_INO_INVALID_ACCESS,
NFS_INO_INVALID_ACL or NFS_INO_INVALID_DATA since there are
no cached access calls, acls or data caches to invalidate.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 81946e7..8d67e5e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -285,9 +285,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		inode->i_mode = fattr->mode;
 		if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
 				&& nfs_server_capable(inode, NFS_CAP_MODE))
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_ACCESS
-				| NFS_INO_INVALID_ACL;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		/* Why so? Because we want revalidate for devices/FIFOs, and
 		 * that's precisely what we have in nfs_file_inode_operations.
 		 */
@@ -337,24 +335,19 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		if (fattr->valid & NFS_ATTR_FATTR_MTIME)
 			inode->i_mtime = fattr->mtime;
 		else if (nfs_server_capable(inode, NFS_CAP_MTIME))
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_DATA;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_CTIME)
 			inode->i_ctime = fattr->ctime;
 		else if (nfs_server_capable(inode, NFS_CAP_CTIME))
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_ACCESS
-				| NFS_INO_INVALID_ACL;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
 			inode->i_version = fattr->change_attr;
 		else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_DATA;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_SIZE)
 			inode->i_size = nfs_size_to_loff_t(fattr->size);
 		else
 			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_DATA
 				| NFS_INO_REVAL_PAGECACHE;
 		if (fattr->valid & NFS_ATTR_FATTR_NLINK)
 			set_nlink(inode, fattr->nlink);
@@ -363,15 +356,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		if (fattr->valid & NFS_ATTR_FATTR_OWNER)
 			inode->i_uid = fattr->uid;
 		else if (nfs_server_capable(inode, NFS_CAP_OWNER))
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_ACCESS
-				| NFS_INO_INVALID_ACL;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_GROUP)
 			inode->i_gid = fattr->gid;
 		else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_ACCESS
-				| NFS_INO_INVALID_ACL;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
 			inode->i_blocks = fattr->du.nfs2.blocks;
 		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
-- 
cgit v0.10.2


From 6a4506c0b56889aaa15bcf50b3c75f46a8d0a3bd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Apr 2012 13:48:17 -0400
Subject: NFS: Change attribute updates should set NFS_INO_REVAL_PAGECACHE

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 8d67e5e..9d76c0b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1312,7 +1312,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		if (inode->i_version != fattr->change_attr) {
 			dprintk("NFS: change_attr change on server for file %s/%ld\n",
 					inode->i_sb->s_id, inode->i_ino);
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			invalid |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_DATA
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_PAGECACHE;
 			if (S_ISDIR(inode->i_mode))
 				nfs_force_lookup_revalidate(inode);
 			inode->i_version = fattr->change_attr;
-- 
cgit v0.10.2


From 3a1556e8662cc425c433b463fcdae138908ca467 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Apr 2012 13:48:18 -0400
Subject: NFSv2/v3: Simulate the change attribute

Use the ctime to simulate a change attribute for NFSv2 and NFSv3.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 60f7e4e..a8f8de6 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -880,7 +880,7 @@ static int nfs_init_server(struct nfs_server *server,
 	server->options = data->options;
 	server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
 		NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
-		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
+		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR;
 
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 0fd1efa..1855e8f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -493,3 +493,15 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
 		PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
 
+/*
+ * Convert a struct timespec into a 64-bit change attribute
+ *
+ * This does approximately the same thing as timespec_to_ns(),
+ * but for calculation efficiency, we multiply the seconds by
+ * 1024*1024*1024.
+ */
+static inline
+u64 nfs_timespec_to_change_attr(const struct timespec *ts)
+{
+	return ((u64)ts->tv_sec << 30) + ts->tv_nsec;
+}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 1f56000..c99008e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -313,6 +313,8 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 	p = xdr_decode_time(p, &fattr->atime);
 	p = xdr_decode_time(p, &fattr->mtime);
 	xdr_decode_time(p, &fattr->ctime);
+	fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
+
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 01e53e9..ee284c2 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -675,6 +675,7 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 	p = xdr_decode_nfstime3(p, &fattr->atime);
 	p = xdr_decode_nfstime3(p, &fattr->mtime);
 	xdr_decode_nfstime3(p, &fattr->ctime);
+	fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
 
 	fattr->valid |= NFS_ATTR_FATTR_V3;
 	return 0;
@@ -725,12 +726,14 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 		goto out_overflow;
 
 	fattr->valid |= NFS_ATTR_FATTR_PRESIZE
+		| NFS_ATTR_FATTR_PRECHANGE
 		| NFS_ATTR_FATTR_PREMTIME
 		| NFS_ATTR_FATTR_PRECTIME;
 
 	p = xdr_decode_size3(p, &fattr->pre_size);
 	p = xdr_decode_nfstime3(p, &fattr->pre_mtime);
 	xdr_decode_nfstime3(p, &fattr->pre_ctime);
+	fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime);
 
 	return 0;
 out_overflow:
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6deb8f0..bc36808 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -106,14 +106,14 @@ struct nfs_fattr {
 		| NFS_ATTR_FATTR_FILEID \
 		| NFS_ATTR_FATTR_ATIME \
 		| NFS_ATTR_FATTR_MTIME \
-		| NFS_ATTR_FATTR_CTIME)
+		| NFS_ATTR_FATTR_CTIME \
+		| NFS_ATTR_FATTR_CHANGE)
 #define NFS_ATTR_FATTR_V2 (NFS_ATTR_FATTR \
 		| NFS_ATTR_FATTR_BLOCKS_USED)
 #define NFS_ATTR_FATTR_V3 (NFS_ATTR_FATTR \
 		| NFS_ATTR_FATTR_SPACE_USED)
 #define NFS_ATTR_FATTR_V4 (NFS_ATTR_FATTR \
-		| NFS_ATTR_FATTR_SPACE_USED \
-		| NFS_ATTR_FATTR_CHANGE)
+		| NFS_ATTR_FATTR_SPACE_USED)
 
 /*
  * Info on the file system
-- 
cgit v0.10.2


From fee7fe196c41847c135cde41b0ec790f53ee6fcf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Apr 2012 13:48:18 -0400
Subject: NFS: Simplify the cache invalidation code

Now that NFSv2 and NFSv3 have simulated change attributes,
instead of using all three of mtime, ctime and change attribute to
manage data cache consistency, we can simplify the code to just use
the change attribute.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9d76c0b..0d53113 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -958,7 +958,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 
 	/* Verify a few of the more important attributes */
 	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+		invalid |= NFS_INO_INVALID_ATTR;
 
 	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
 		cur_size = i_size_read(inode);
@@ -1325,38 +1325,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		invalid |= save_cache_validity;
 
 	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
-		/* NFSv2/v3: Check if the mtime agrees */
-		if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
-			dprintk("NFS: mtime change on server for file %s/%ld\n",
-					inode->i_sb->s_id, inode->i_ino);
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
-			if (S_ISDIR(inode->i_mode))
-				nfs_force_lookup_revalidate(inode);
-			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-		}
+		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
 	} else if (server->caps & NFS_CAP_MTIME)
 		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_DATA
-				| NFS_INO_REVAL_PAGECACHE
 				| NFS_INO_REVAL_FORCED);
 
 	if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
-		/* If ctime has changed we should definitely clear access+acl caches */
-		if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-			/* and probably clear data for a directory too as utimes can cause
-			 * havoc with our cache.
-			 */
-			if (S_ISDIR(inode->i_mode)) {
-				invalid |= NFS_INO_INVALID_DATA;
-				nfs_force_lookup_revalidate(inode);
-			}
-			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-		}
+		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
 	} else if (server->caps & NFS_CAP_CTIME)
 		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_ACCESS
-				| NFS_INO_INVALID_ACL
 				| NFS_INO_REVAL_FORCED);
 
 	/* Check if our cached file size is stale */
-- 
cgit v0.10.2


From 90ff0c548d1220d31f80e498b587393895705e6c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Apr 2012 13:48:18 -0400
Subject: NFSv4: Simplify the NFSv4 OPEN compound

Get rid of the post-op GETATTR on the directory in order to reduce
the amount of processing done on the server.

The cost is that if we later need to stat() the directory, then we
know that the ctime and mtime are likely to be invalid.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2e0fbff..f01c3d1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -788,7 +788,6 @@ struct nfs4_opendata {
 	struct nfs4_string owner_name;
 	struct nfs4_string group_name;
 	struct nfs_fattr f_attr;
-	struct nfs_fattr dir_attr;
 	struct dentry *dir;
 	struct dentry *dentry;
 	struct nfs4_state_owner *owner;
@@ -804,12 +803,10 @@ struct nfs4_opendata {
 static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 {
 	p->o_res.f_attr = &p->f_attr;
-	p->o_res.dir_attr = &p->dir_attr;
 	p->o_res.seqid = p->o_arg.seqid;
 	p->c_res.seqid = p->c_arg.seqid;
 	p->o_res.server = p->o_arg.server;
 	nfs_fattr_init(&p->f_attr);
-	nfs_fattr_init(&p->dir_attr);
 	nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
 }
 
@@ -843,7 +840,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p->o_arg.name = &dentry->d_name;
 	p->o_arg.server = server;
 	p->o_arg.bitmask = server->attr_bitmask;
-	p->o_arg.dir_bitmask = server->cache_consistency_bitmask;
 	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
 	if (attrs != NULL && attrs->ia_valid != 0) {
 		__be32 verf[2];
@@ -1611,8 +1607,6 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
 
 	nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr);
 
-	nfs_refresh_inode(dir, o_res->dir_attr);
-
 	if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
 		status = _nfs4_proc_open_confirm(data);
 		if (status != 0)
@@ -1645,11 +1639,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 
 	nfs_fattr_map_and_free_names(server, &data->f_attr);
 
-	if (o_arg->open_flags & O_CREAT) {
+	if (o_arg->open_flags & O_CREAT)
 		update_changeattr(dir, &o_res->cinfo);
-		nfs_post_op_update_inode(dir, o_res->dir_attr);
-	} else
-		nfs_refresh_inode(dir, o_res->dir_attr);
 	if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
 		server->caps &= ~NFS_CAP_POSIX_LOCK;
 	if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index ac7a3b0..6e878dc 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -431,20 +431,14 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_enc_open_sz        (compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
-				encode_savefh_maxsz + \
 				encode_open_maxsz + \
 				encode_getfh_maxsz + \
-				encode_getattr_maxsz + \
-				encode_restorefh_maxsz + \
 				encode_getattr_maxsz)
 #define NFS4_dec_open_sz        (compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
-				decode_savefh_maxsz + \
 				decode_open_maxsz + \
 				decode_getfh_maxsz + \
-				decode_getattr_maxsz + \
-				decode_restorefh_maxsz + \
 				decode_getattr_maxsz)
 #define NFS4_enc_open_confirm_sz \
 				(compound_encode_hdr_maxsz + \
@@ -2191,12 +2185,9 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
-	encode_savefh(xdr, &hdr);
 	encode_open(xdr, args, &hdr);
 	encode_getfh(xdr, &hdr);
 	encode_getfattr(xdr, args->bitmask, &hdr);
-	encode_restorefh(xdr, &hdr);
-	encode_getfattr(xdr, args->dir_bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -6075,19 +6066,12 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_savefh(xdr);
-	if (status)
-		goto out;
 	status = decode_open(xdr, res);
 	if (status)
 		goto out;
 	if (decode_getfh(xdr, &res->fh) != 0)
 		goto out;
-	if (decode_getfattr(xdr, res->f_attr, res->server) != 0)
-		goto out;
-	if (decode_restorefh(xdr) != 0)
-		goto out;
-	decode_getfattr(xdr, res->dir_attr, res->server);
+	decode_getfattr(xdr, res->f_attr, res->server);
 out:
 	return status;
 }
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index bc36808..92a929f 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -338,7 +338,6 @@ struct nfs_openargs {
 	const struct qstr *	name;
 	const struct nfs_server *server;	 /* Needed for ID mapping */
 	const u32 *		bitmask;
-	const u32 *		dir_bitmask;
 	__u32			claim;
 	struct nfs4_sequence_args	seq_args;
 };
@@ -349,7 +348,6 @@ struct nfs_openres {
 	struct nfs4_change_info	cinfo;
 	__u32                   rflags;
 	struct nfs_fattr *      f_attr;
-	struct nfs_fattr *      dir_attr;
 	struct nfs_seqid *	seqid;
 	const struct nfs_server *server;
 	fmode_t			delegation_type;
-- 
cgit v0.10.2


From 7c317fcfbae773e493ecee1c53738db774b1d0ca Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Apr 2012 13:48:18 -0400
Subject: NFSv4: Simplify the NFSv4 CREATE compound

Get rid of the post-op GETATTR on the directory in order to reduce
the amount of processing done on the server.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f01c3d1..619bc1e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2994,7 +2994,6 @@ struct nfs4_createdata {
 	struct nfs4_create_res res;
 	struct nfs_fh fh;
 	struct nfs_fattr fattr;
-	struct nfs_fattr dir_fattr;
 };
 
 static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -3018,9 +3017,7 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
 		data->res.server = server;
 		data->res.fh = &data->fh;
 		data->res.fattr = &data->fattr;
-		data->res.dir_fattr = &data->dir_fattr;
 		nfs_fattr_init(data->res.fattr);
-		nfs_fattr_init(data->res.dir_fattr);
 	}
 	return data;
 }
@@ -3031,7 +3028,6 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
 				    &data->arg.seq_args, &data->res.seq_res, 1);
 	if (status == 0) {
 		update_changeattr(dir, &data->res.dir_cinfo);
-		nfs_post_op_update_inode(dir, data->res.dir_fattr);
 		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
 	}
 	return status;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 6e878dc..1a70097 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -647,20 +647,14 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_enc_create_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
-				encode_savefh_maxsz + \
 				encode_create_maxsz + \
 				encode_getfh_maxsz + \
-				encode_getattr_maxsz + \
-				encode_restorefh_maxsz + \
 				encode_getattr_maxsz)
 #define NFS4_dec_create_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
-				decode_savefh_maxsz + \
 				decode_create_maxsz + \
 				decode_getfh_maxsz + \
-				decode_getattr_maxsz + \
-				decode_restorefh_maxsz + \
 				decode_getattr_maxsz)
 #define NFS4_enc_pathconf_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
@@ -2119,12 +2113,9 @@ static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->dir_fh, &hdr);
-	encode_savefh(xdr, &hdr);
 	encode_create(xdr, args, &hdr);
 	encode_getfh(xdr, &hdr);
 	encode_getfattr(xdr, args->bitmask, &hdr);
-	encode_restorefh(xdr, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -5895,21 +5886,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_savefh(xdr);
-	if (status)
-		goto out;
 	status = decode_create(xdr, &res->dir_cinfo);
 	if (status)
 		goto out;
 	status = decode_getfh(xdr, res->fh);
 	if (status)
 		goto out;
-	if (decode_getfattr(xdr, res->fattr, res->server))
-		goto out;
-	status = decode_restorefh(xdr);
-	if (status)
-		goto out;
-	decode_getfattr(xdr, res->dir_fattr, res->server);
+	decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 92a929f..696a17e 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -855,7 +855,6 @@ struct nfs4_create_res {
 	struct nfs_fh *			fh;
 	struct nfs_fattr *		fattr;
 	struct nfs4_change_info		dir_cinfo;
-	struct nfs_fattr *		dir_fattr;
 	struct nfs4_sequence_res	seq_res;
 };
 
-- 
cgit v0.10.2


From 778d28172f710184855bcfeadcdd6b46997c4de2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Apr 2012 13:48:19 -0400
Subject: NFSv4: Simplify the NFSv4 REMOVE, LINK and RENAME compounds

Get rid of the post-op GETATTR on the directory in order to reduce
the amount of processing done on the server.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 619bc1e..c746b0c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2775,7 +2775,6 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
 		.fh = NFS_FH(dir),
 		.name.len = name->len,
 		.name.name = name->name,
-		.bitmask = server->attr_bitmask,
 	};
 	struct nfs_removeres res = {
 		.server = server,
@@ -2785,19 +2784,11 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
-	int status = -ENOMEM;
-
-	res.dir_attr = nfs_alloc_fattr();
-	if (res.dir_attr == NULL)
-		goto out;
+	int status;
 
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
-	if (status == 0) {
+	if (status == 0)
 		update_changeattr(dir, &res.cinfo);
-		nfs_post_op_update_inode(dir, res.dir_attr);
-	}
-	nfs_free_fattr(res.dir_attr);
-out:
 	return status;
 }
 
@@ -2819,7 +2810,6 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 	struct nfs_removeargs *args = msg->rpc_argp;
 	struct nfs_removeres *res = msg->rpc_resp;
 
-	args->bitmask = server->cache_consistency_bitmask;
 	res->server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 	nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
@@ -2844,7 +2834,6 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
 		return 0;
 	update_changeattr(dir, &res->cinfo);
-	nfs_post_op_update_inode(dir, res->dir_attr);
 	return 1;
 }
 
@@ -2855,7 +2844,6 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
 	struct nfs_renameres *res = msg->rpc_resp;
 
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
-	arg->bitmask = server->attr_bitmask;
 	res->server = server;
 	nfs41_init_sequence(&arg->seq_args, &res->seq_res, 1);
 }
@@ -2881,9 +2869,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
 		return 0;
 
 	update_changeattr(old_dir, &res->old_cinfo);
-	nfs_post_op_update_inode(old_dir, res->old_fattr);
 	update_changeattr(new_dir, &res->new_cinfo);
-	nfs_post_op_update_inode(new_dir, res->new_fattr);
 	return 1;
 }
 
@@ -2896,7 +2882,6 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.new_dir = NFS_FH(new_dir),
 		.old_name = old_name,
 		.new_name = new_name,
-		.bitmask = server->attr_bitmask,
 	};
 	struct nfs_renameres res = {
 		.server = server,
@@ -2908,21 +2893,11 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 	};
 	int status = -ENOMEM;
 	
-	res.old_fattr = nfs_alloc_fattr();
-	res.new_fattr = nfs_alloc_fattr();
-	if (res.old_fattr == NULL || res.new_fattr == NULL)
-		goto out;
-
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (!status) {
 		update_changeattr(old_dir, &res.old_cinfo);
-		nfs_post_op_update_inode(old_dir, res.old_fattr);
 		update_changeattr(new_dir, &res.new_cinfo);
-		nfs_post_op_update_inode(new_dir, res.new_fattr);
 	}
-out:
-	nfs_free_fattr(res.new_fattr);
-	nfs_free_fattr(res.old_fattr);
 	return status;
 }
 
@@ -2960,18 +2935,15 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 	int status = -ENOMEM;
 
 	res.fattr = nfs_alloc_fattr();
-	res.dir_attr = nfs_alloc_fattr();
-	if (res.fattr == NULL || res.dir_attr == NULL)
+	if (res.fattr == NULL)
 		goto out;
 
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (!status) {
 		update_changeattr(dir, &res.cinfo);
-		nfs_post_op_update_inode(dir, res.dir_attr);
 		nfs_post_op_update_inode(inode, res.fattr);
 	}
 out:
-	nfs_free_fattr(res.dir_attr);
 	nfs_free_fattr(res.fattr);
 	return status;
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1a70097..49483f1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -589,38 +589,29 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_enc_remove_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
-				encode_remove_maxsz + \
-				encode_getattr_maxsz)
+				encode_remove_maxsz)
 #define NFS4_dec_remove_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
-				decode_remove_maxsz + \
-				decode_getattr_maxsz)
+				decode_remove_maxsz)
 #define NFS4_enc_rename_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
 				encode_savefh_maxsz + \
 				encode_putfh_maxsz + \
-				encode_rename_maxsz + \
-				encode_getattr_maxsz + \
-				encode_restorefh_maxsz + \
-				encode_getattr_maxsz)
+				encode_rename_maxsz)
 #define NFS4_dec_rename_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_savefh_maxsz + \
 				decode_putfh_maxsz + \
-				decode_rename_maxsz + \
-				decode_getattr_maxsz + \
-				decode_restorefh_maxsz + \
-				decode_getattr_maxsz)
+				decode_rename_maxsz)
 #define NFS4_enc_link_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
 				encode_savefh_maxsz + \
 				encode_putfh_maxsz + \
 				encode_link_maxsz + \
-				encode_getattr_maxsz + \
 				encode_restorefh_maxsz + \
 				encode_getattr_maxsz)
 #define NFS4_dec_link_sz	(compound_decode_hdr_maxsz + \
@@ -629,7 +620,6 @@ static int nfs4_stat_to_errno(int);
 				decode_savefh_maxsz + \
 				decode_putfh_maxsz + \
 				decode_link_maxsz + \
-				decode_getattr_maxsz + \
 				decode_restorefh_maxsz + \
 				decode_getattr_maxsz)
 #define NFS4_enc_symlink_sz	(compound_encode_hdr_maxsz + \
@@ -2052,7 +2042,6 @@ static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_remove(xdr, &args->name, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -2072,9 +2061,6 @@ static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_savefh(xdr, &hdr);
 	encode_putfh(xdr, args->new_dir, &hdr);
 	encode_rename(xdr, args->old_name, args->new_name, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
-	encode_restorefh(xdr, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -2094,7 +2080,6 @@ static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_savefh(xdr, &hdr);
 	encode_putfh(xdr, args->dir_fh, &hdr);
 	encode_link(xdr, args->name, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_restorefh(xdr, &hdr);
 	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
@@ -5782,9 +5767,6 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	if (status)
 		goto out;
 	status = decode_remove(xdr, &res->cinfo);
-	if (status)
-		goto out;
-	decode_getfattr(xdr, res->dir_attr, res->server);
 out:
 	return status;
 }
@@ -5814,15 +5796,6 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	if (status)
 		goto out;
 	status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
-	if (status)
-		goto out;
-	/* Current FH is target directory */
-	if (decode_getfattr(xdr, res->new_fattr, res->server))
-		goto out;
-	status = decode_restorefh(xdr);
-	if (status)
-		goto out;
-	decode_getfattr(xdr, res->old_fattr, res->server);
 out:
 	return status;
 }
@@ -5858,8 +5831,6 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	 * Note order: OP_LINK leaves the directory as the current
 	 *             filehandle.
 	 */
-	if (decode_getfattr(xdr, res->dir_attr, res->server))
-		goto out;
 	status = decode_restorefh(xdr);
 	if (status)
 		goto out;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 696a17e..2e53a3f 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -540,7 +540,6 @@ struct nfs_commitres {
 struct nfs_removeargs {
 	const struct nfs_fh	*fh;
 	struct qstr		name;
-	const u32 *		bitmask;
 	struct nfs4_sequence_args	seq_args;
 };
 
@@ -559,7 +558,6 @@ struct nfs_renameargs {
 	const struct nfs_fh		*new_dir;
 	const struct qstr		*old_name;
 	const struct qstr		*new_name;
-	const u32			*bitmask;
 	struct nfs4_sequence_args	seq_args;
 };
 
-- 
cgit v0.10.2


From 5a37f85131c526ed7a3991d4dc2845498f81c1de Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 28 Apr 2012 14:55:16 -0400
Subject: NFSv4: Don't request cache consistency attributes on some writes

We don't need cache consistency information when we're doing O_DIRECT
writes. Ditto for the case of delegated writes.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c746b0c..64b67f3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3370,7 +3370,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
 	}
 	if (task->tk_status >= 0) {
 		renew_lease(NFS_SERVER(inode), data->timestamp);
-		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
 	}
 	return 0;
 }
@@ -3401,15 +3401,30 @@ void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
 }
 EXPORT_SYMBOL_GPL(nfs4_reset_write);
 
+static
+bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data)
+{
+	const struct nfs_pgio_header *hdr = data->header;
+
+	/* Don't request attributes for pNFS or O_DIRECT writes */
+	if (data->ds_clp != NULL || hdr->dreq != NULL)
+		return false;
+	/* Otherwise, request attributes if and only if we don't hold
+	 * a delegation
+	 */
+	return nfs_have_delegation(hdr->inode, FMODE_READ) == 0;
+}
+
 static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
 	struct nfs_server *server = NFS_SERVER(data->header->inode);
 
-	if (data->ds_clp) {
+	if (!nfs4_write_need_cache_consistency_data(data)) {
 		data->args.bitmask = NULL;
 		data->res.fattr = NULL;
 	} else
 		data->args.bitmask = server->cache_consistency_bitmask;
+
 	if (!data->write_done_cb)
 		data->write_done_cb = nfs4_write_done_cb;
 	data->res.server = server;
-- 
cgit v0.10.2


From 8582715e733d08bc98fe629db0601360d70de4dc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 29 Apr 2012 10:44:42 -0400
Subject: NFSv4: COMMIT does not need post-op attributes

No attributes are supposed to change during a COMMIT call, so there
is no need to request post-op attributes.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 64b67f3..98eb48d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3462,7 +3462,6 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
-	nfs_refresh_inode(inode, data->res.fattr);
 	return 0;
 }
 
@@ -3477,11 +3476,6 @@ static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
-	if (data->lseg) {
-		data->args.bitmask = NULL;
-		data->res.fattr = NULL;
-	} else
-		data->args.bitmask = server->cache_consistency_bitmask;
 	if (data->commit_done_cb == NULL)
 		data->commit_done_cb = nfs4_commit_done_cb;
 	data->res.server = server;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 49483f1..db040e9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -421,13 +421,11 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_enc_commit_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
-				encode_commit_maxsz + \
-				encode_getattr_maxsz)
+				encode_commit_maxsz)
 #define NFS4_dec_commit_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
-				decode_commit_maxsz + \
-				decode_getattr_maxsz)
+				decode_commit_maxsz)
 #define NFS4_enc_open_sz        (compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
@@ -2425,8 +2423,6 @@ static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_commit(xdr, args, &hdr);
-	if (args->bitmask)
-		encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -6306,10 +6302,6 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	if (status)
 		goto out;
 	status = decode_commit(xdr, res);
-	if (status)
-		goto out;
-	if (res->fattr)
-		decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
-- 
cgit v0.10.2


From d69ee9b85541a69a1092f5da675bd23256dc62af Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 1 May 2012 17:37:59 -0400
Subject: NFS: Adapt readdirplus to application usage patterns

While the use of READDIRPLUS is significantly more efficient than
READDIR followed by many LOOKUP calls, it is still less efficient
than just READDIR if the attributes are not required.

This patch tracks when lookups are attempted on the directory,
and uses that information to selectively disable READDIRPLUS
on that directory.
The first 'readdir' call is always served using READDIRPLUS.
Subsequent calls only use READDIRPLUS if there was a successful
lookup or revalidation on a child in the mean time.

Credit for the original idea should go to Neil Brown. See:
      http://www.spinics.net/lists/linux-nfs/msg19996.html
However, the implementation in this patch differs from Neil's
in that it focuses on tracking lookups rather than calls to
stat().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Neil Brown <neilb@suse.de>

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 82b42e2..d0884c0 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -475,6 +475,29 @@ different:
 }
 
 static
+bool nfs_use_readdirplus(struct inode *dir, struct file *filp)
+{
+	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
+		return false;
+	if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
+		return true;
+	if (filp->f_pos == 0)
+		return true;
+	return false;
+}
+
+/*
+ * This function is called by the lookup code to request the use of
+ * readdirplus to accelerate any future lookups in the same
+ * directory.
+ */
+static
+void nfs_advise_use_readdirplus(struct inode *dir)
+{
+	set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);
+}
+
+static
 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 {
 	struct qstr filename = {
@@ -874,7 +897,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	desc->file = filp;
 	desc->dir_cookie = &dir_ctx->dir_cookie;
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
-	desc->plus = NFS_USE_READDIRPLUS(inode);
+	desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0;
 
 	nfs_block_sillyrename(dentry);
 	res = nfs_revalidate_mapping(inode, filp->f_mapping);
@@ -1114,7 +1137,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
 	if (!inode) {
 		if (nfs_neg_need_reval(dir, dentry, nd))
 			goto out_bad;
-		goto out_valid;
+		goto out_valid_noent;
 	}
 
 	if (is_bad_inode(inode)) {
@@ -1156,6 +1179,9 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
 out_set_verifier:
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
  out_valid:
+	/* Success: notify readdir to use READDIRPLUS */
+	nfs_advise_use_readdirplus(dir);
+ out_valid_noent:
 	dput(parent);
 	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
 			__func__, dentry->d_parent->d_name.name,
@@ -1311,6 +1337,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 	if (IS_ERR(res))
 		goto out_unblock_sillyrename;
 
+	/* Success: notify readdir to use READDIRPLUS */
+	nfs_advise_use_readdirplus(dir);
+
 no_entry:
 	res = d_materialise_unique(dentry, inode);
 	if (res != NULL) {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0d53113..9f17cd1 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -298,8 +298,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
 			inode->i_fop = &nfs_dir_operations;
 			inode->i_data.a_ops = &nfs_dir_aops;
-			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
-				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			/* Deal with crossing mountpoints */
 			if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
 					fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 8a88c16..6cc7dba 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -314,11 +314,6 @@ static inline int nfs_server_capable(struct inode *inode, int cap)
 	return NFS_SERVER(inode)->caps & cap;
 }
 
-static inline int NFS_USE_READDIRPLUS(struct inode *inode)
-{
-	return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
-}
-
 static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
 {
 	dentry->d_time = verf;
-- 
cgit v0.10.2


From bf5fc4028ef751904a114ffc4b5d2cd9f0233142 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 4 May 2012 13:47:16 -0400
Subject: NFS: Fix O_DIRECT compile warnings

Fix the following compile warnings:
fs/nfs/direct.c: In function 'nfs_direct_read_schedule_segment':
fs/nfs/direct.c:325:11: warning: comparison of distinct pointer types
lacks a cast [enabled by default]
fs/nfs/direct.c:325:11: warning: comparison of distinct pointer types
lacks a cast [enabled by default]
fs/nfs/direct.c:325:11: warning: comparison of distinct pointer types
lacks a cast [enabled by default]
fs/nfs/direct.c:352:27: warning: comparison of distinct pointer types
lacks a cast [enabled by default]
fs/nfs/direct.c: In function 'nfs_direct_write_schedule_segment':
fs/nfs/direct.c:622:11: warning: comparison of distinct pointer types
lacks a cast [enabled by default]
fs/nfs/direct.c:622:11: warning: comparison of distinct pointer types
lacks a cast [enabled by default]
fs/nfs/direct.c:622:11: warning: comparison of distinct pointer types
lacks a cast [enabled by default]
fs/nfs/direct.c:650:27: warning: comparison of distinct pointer types
lacks a cast [enabled by default]

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index aab3016..dca9c81 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -322,7 +322,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
 		int i;
 
 		pgbase = user_addr & ~PAGE_MASK;
-		bytes = min(max(rsize, PAGE_SIZE), count);
+		bytes = min(max_t(size_t, rsize, PAGE_SIZE), count);
 
 		result = -ENOMEM;
 		npages = nfs_page_array_len(pgbase, bytes);
@@ -349,7 +349,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
 
 		for (i = 0; i < npages; i++) {
 			struct nfs_page *req;
-			unsigned int req_len = min(bytes, PAGE_SIZE - pgbase);
+			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 			/* XXX do we need to do the eof zeroing found in async_filler? */
 			req = nfs_create_request(dreq->ctx, dreq->inode,
 						 pagevec[i],
@@ -619,7 +619,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
 		int i;
 
 		pgbase = user_addr & ~PAGE_MASK;
-		bytes = min(max(wsize, PAGE_SIZE), count);
+		bytes = min(max_t(size_t, wsize, PAGE_SIZE), count);
 
 		result = -ENOMEM;
 		npages = nfs_page_array_len(pgbase, bytes);
@@ -647,7 +647,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
 
 		for (i = 0; i < npages; i++) {
 			struct nfs_page *req;
-			unsigned int req_len = min(bytes, PAGE_SIZE - pgbase);
+			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 
 			req = nfs_create_request(dreq->ctx, dreq->inode,
 						 pagevec[i],
-- 
cgit v0.10.2


From 1385b8117325e79f74c1e7d1cbf45c789deb85c5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 4 May 2012 13:54:24 -0400
Subject: NFS: Fix sparse warnings

Fix the following sparse warnings:

fs/nfs/direct.c:221:6: warning: symbol 'nfs_direct_readpage_release' was
not declared. Should it be static?
fs/nfs/read.c:38:43: warning: non-ANSI function declaration of function
'nfs_readhdr_alloc'
fs/nfs/objlayout/objio_osd.c:214:5: warning: symbol '__alloc_objio_seg'
was not declared. Should it be static?

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>
Cc: Boaz Harrosh <bharrosh@panasas.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index dca9c81..257d009 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -218,7 +218,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 	nfs_direct_req_release(dreq);
 }
 
-void nfs_direct_readpage_release(struct nfs_page *req)
+static void nfs_direct_readpage_release(struct nfs_page *req)
 {
 	dprintk("NFS: direct read done (%s/%lld %d@%lld)\n",
 		req->wb_context->dentry->d_inode->i_sb->s_id,
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index fbf4874..b47277ba 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -211,7 +211,7 @@ static void copy_single_comp(struct ore_components *oc, unsigned c,
 	memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
 }
 
-int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
+static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
 		       struct objio_segment **pseg)
 {
 /*	This is the in memory structure of the objio_segment
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 37c9eb2..f23cf25 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -35,7 +35,7 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
 
-struct nfs_read_header *nfs_readhdr_alloc()
+struct nfs_read_header *nfs_readhdr_alloc(void)
 {
 	struct nfs_read_header *rhdr;
 
-- 
cgit v0.10.2


From 1ca5513af77307eccea7efd4d12ef5c14f1b12ab Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 7 May 2012 10:03:17 +0100
Subject: mfd: Fix tps65090 ifdefs for suspend mode

CONFIG_PM also covers runtime only PM.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Venu Byravarasu <vbyravarasu@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65090.c b/drivers/mfd/tps65090.c
index a66d4df..ac7e8b1 100644
--- a/drivers/mfd/tps65090.c
+++ b/drivers/mfd/tps65090.c
@@ -334,7 +334,7 @@ static int __devexit tps65090_i2c_remove(struct i2c_client *client)
 	return 0;
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 static int tps65090_i2c_suspend(struct i2c_client *client, pm_message_t state)
 {
 	if (client->irq)
@@ -363,7 +363,7 @@ static struct i2c_driver tps65090_driver = {
 	},
 	.probe		= tps65090_i2c_probe,
 	.remove		= __devexit_p(tps65090_i2c_remove),
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 	.suspend	= tps65090_i2c_suspend,
 	.resume		= tps65090_i2c_resume,
 #endif
-- 
cgit v0.10.2


From b6c9eeef4e775e1fff76f4395d11638dc198271d Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 7 May 2012 10:03:18 +0100
Subject: mfd: Don't use I2C-specific suspend and resume operations for
 tps65090

The legacy suspend operations have been deprecated and printing warnings
on boot for over a year now.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Venu Byravarasu <vbyravarasu@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65090.c b/drivers/mfd/tps65090.c
index ac7e8b1..6dc4c34 100644
--- a/drivers/mfd/tps65090.c
+++ b/drivers/mfd/tps65090.c
@@ -335,21 +335,27 @@ static int __devexit tps65090_i2c_remove(struct i2c_client *client)
 }
 
 #ifdef CONFIG_PM_SLEEP
-static int tps65090_i2c_suspend(struct i2c_client *client, pm_message_t state)
+static int tps65090_suspend(struct device *dev)
 {
+	struct i2c_client *client = to_i2c_client(dev);
 	if (client->irq)
 		disable_irq(client->irq);
 	return 0;
 }
 
-static int tps65090_i2c_resume(struct i2c_client *client)
+static int tps65090_resume(struct device *dev)
 {
+	struct i2c_client *client = to_i2c_client(dev);
 	if (client->irq)
 		enable_irq(client->irq);
 	return 0;
 }
 #endif
 
+static const struct dev_pm_ops tps65090_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(tps65090_suspend, tps65090_resume)
+};
+
 static const struct i2c_device_id tps65090_id_table[] = {
 	{ "tps65090", 0 },
 	{ },
@@ -360,13 +366,10 @@ static struct i2c_driver tps65090_driver = {
 	.driver	= {
 		.name	= "tps65090",
 		.owner	= THIS_MODULE,
+		.pm	= &tps65090_pm_ops,
 	},
 	.probe		= tps65090_i2c_probe,
 	.remove		= __devexit_p(tps65090_i2c_remove),
-#ifdef CONFIG_PM_SLEEP
-	.suspend	= tps65090_i2c_suspend,
-	.resume		= tps65090_i2c_resume,
-#endif
 	.id_table	= tps65090_id_table,
 };
 
-- 
cgit v0.10.2


From 63745d4068de8ccea3580214c6dbfdca0ec37859 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 7 May 2012 10:03:19 +0100
Subject: mfd: Fix tps65910 section annotations

A warning was being generated by the reference from tps65910_i2c_probe()
to tps65910_sleepinit() since the latter was annotated as __init but the
former was unannotated. Since these functions can only be called during
device init make them both __devinit, and while we're at it also annotate
tps65910_i2c_remove() __devexit for symmetry.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c
index ae7f47b..7a55af9 100644
--- a/drivers/mfd/tps65910.c
+++ b/drivers/mfd/tps65910.c
@@ -90,7 +90,7 @@ static const struct regmap_config tps65910_regmap_config = {
 	.cache_type = REGCACHE_RBTREE,
 };
 
-static int __init tps65910_sleepinit(struct tps65910 *tps65910,
+static int __devinit tps65910_sleepinit(struct tps65910 *tps65910,
 		struct tps65910_board *pmic_pdata)
 {
 	struct device *dev = NULL;
@@ -150,8 +150,8 @@ err_sleep_init:
 }
 
 
-static int tps65910_i2c_probe(struct i2c_client *i2c,
-			    const struct i2c_device_id *id)
+static __devinit int tps65910_i2c_probe(struct i2c_client *i2c,
+					const struct i2c_device_id *id)
 {
 	struct tps65910 *tps65910;
 	struct tps65910_board *pmic_plat_data;
@@ -213,7 +213,7 @@ regmap_err:
 	return ret;
 }
 
-static int tps65910_i2c_remove(struct i2c_client *i2c)
+static __devexit int tps65910_i2c_remove(struct i2c_client *i2c)
 {
 	struct tps65910 *tps65910 = i2c_get_clientdata(i2c);
 
@@ -239,7 +239,7 @@ static struct i2c_driver tps65910_i2c_driver = {
 		   .owner = THIS_MODULE,
 	},
 	.probe = tps65910_i2c_probe,
-	.remove = tps65910_i2c_remove,
+	.remove = __devexit_p(tps65910_i2c_remove),
 	.id_table = tps65910_i2c_id,
 };
 
-- 
cgit v0.10.2


From ce7e4e11221dd7fbe82c8ad28d1875b0dfa20de4 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 7 May 2012 10:03:20 +0100
Subject: mfd: Fix wm831x register range passing for recent ARM updates

The removal of mach/io.h from most ARM platforms also set the range of
valid IO ports to be empty for most platforms when previously any 32
bit integer had been valid. This makes it impossible to add IO resources
as the added range is smaller than that of the root resource for IO ports.

Since we're not really using IO memory at all fix this by defining our
own root resource outside the normal tree and make that the parent of
all IO resources. This also ensures we won't conflict with read IO ports
if we ever run on a platform which happens to use them.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c
index 838056c..476e4d3 100644
--- a/drivers/mfd/wm831x-core.c
+++ b/drivers/mfd/wm831x-core.c
@@ -614,8 +614,15 @@ int wm831x_set_bits(struct wm831x *wm831x, unsigned short reg,
 }
 EXPORT_SYMBOL_GPL(wm831x_set_bits);
 
+static struct resource wm831x_io_parent = {
+	.start = 0,
+	.end   = 0xffffffff,
+	.flags = IORESOURCE_IO,
+};
+
 static struct resource wm831x_dcdc1_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_DC1_CONTROL_1,
 		.end   = WM831X_DC1_DVS_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -637,6 +644,7 @@ static struct resource wm831x_dcdc1_resources[] = {
 
 static struct resource wm831x_dcdc2_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_DC2_CONTROL_1,
 		.end   = WM831X_DC2_DVS_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -657,6 +665,7 @@ static struct resource wm831x_dcdc2_resources[] = {
 
 static struct resource wm831x_dcdc3_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_DC3_CONTROL_1,
 		.end   = WM831X_DC3_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -671,6 +680,7 @@ static struct resource wm831x_dcdc3_resources[] = {
 
 static struct resource wm831x_dcdc4_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_DC4_CONTROL,
 		.end   = WM831X_DC4_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -685,6 +695,7 @@ static struct resource wm831x_dcdc4_resources[] = {
 
 static struct resource wm8320_dcdc4_buck_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_DC4_CONTROL,
 		.end   = WM832X_DC4_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -707,6 +718,7 @@ static struct resource wm831x_gpio_resources[] = {
 
 static struct resource wm831x_isink1_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_CURRENT_SINK_1,
 		.end   = WM831X_CURRENT_SINK_1,
 		.flags = IORESOURCE_IO,
@@ -720,6 +732,7 @@ static struct resource wm831x_isink1_resources[] = {
 
 static struct resource wm831x_isink2_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_CURRENT_SINK_2,
 		.end   = WM831X_CURRENT_SINK_2,
 		.flags = IORESOURCE_IO,
@@ -733,6 +746,7 @@ static struct resource wm831x_isink2_resources[] = {
 
 static struct resource wm831x_ldo1_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO1_CONTROL,
 		.end   = WM831X_LDO1_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -747,6 +761,7 @@ static struct resource wm831x_ldo1_resources[] = {
 
 static struct resource wm831x_ldo2_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO2_CONTROL,
 		.end   = WM831X_LDO2_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -761,6 +776,7 @@ static struct resource wm831x_ldo2_resources[] = {
 
 static struct resource wm831x_ldo3_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO3_CONTROL,
 		.end   = WM831X_LDO3_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -775,6 +791,7 @@ static struct resource wm831x_ldo3_resources[] = {
 
 static struct resource wm831x_ldo4_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO4_CONTROL,
 		.end   = WM831X_LDO4_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -789,6 +806,7 @@ static struct resource wm831x_ldo4_resources[] = {
 
 static struct resource wm831x_ldo5_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO5_CONTROL,
 		.end   = WM831X_LDO5_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -803,6 +821,7 @@ static struct resource wm831x_ldo5_resources[] = {
 
 static struct resource wm831x_ldo6_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO6_CONTROL,
 		.end   = WM831X_LDO6_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -817,6 +836,7 @@ static struct resource wm831x_ldo6_resources[] = {
 
 static struct resource wm831x_ldo7_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO7_CONTROL,
 		.end   = WM831X_LDO7_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -831,6 +851,7 @@ static struct resource wm831x_ldo7_resources[] = {
 
 static struct resource wm831x_ldo8_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO8_CONTROL,
 		.end   = WM831X_LDO8_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -845,6 +866,7 @@ static struct resource wm831x_ldo8_resources[] = {
 
 static struct resource wm831x_ldo9_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO9_CONTROL,
 		.end   = WM831X_LDO9_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -859,6 +881,7 @@ static struct resource wm831x_ldo9_resources[] = {
 
 static struct resource wm831x_ldo10_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO10_CONTROL,
 		.end   = WM831X_LDO10_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -873,6 +896,7 @@ static struct resource wm831x_ldo10_resources[] = {
 
 static struct resource wm831x_ldo11_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO11_ON_CONTROL,
 		.end   = WM831X_LDO11_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -974,6 +998,7 @@ static struct resource wm831x_rtc_resources[] = {
 
 static struct resource wm831x_status1_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_STATUS_LED_1,
 		.end   = WM831X_STATUS_LED_1,
 		.flags = IORESOURCE_IO,
@@ -982,6 +1007,7 @@ static struct resource wm831x_status1_resources[] = {
 
 static struct resource wm831x_status2_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_STATUS_LED_2,
 		.end   = WM831X_STATUS_LED_2,
 		.flags = IORESOURCE_IO,
-- 
cgit v0.10.2


From b7b142d9fc056e98e6fdef82dca3e87067517340 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 7 May 2012 10:03:21 +0100
Subject: mfd: Convert wm8350 physical I/O to regmap API

The driver still uses a custom cache implementation but the underlying
physical I/O is now done using the regmap API, saving some code and
avoiding allocating enormous scratch arrays on the stack.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c
index dd1caaa..8a9b11c 100644
--- a/drivers/mfd/wm8350-core.c
+++ b/drivers/mfd/wm8350-core.c
@@ -20,6 +20,7 @@
 #include <linux/device.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/regmap.h>
 #include <linux/workqueue.h>
 
 #include <linux/mfd/wm8350/core.h>
@@ -74,7 +75,7 @@ static int wm8350_phys_read(struct wm8350 *wm8350, u8 reg, int num_regs,
 	int bytes = num_regs * 2;
 
 	dev_dbg(wm8350->dev, "volatile read\n");
-	ret = wm8350->read_dev(wm8350, reg, bytes, (char *)dest);
+	ret = regmap_raw_read(wm8350->regmap, reg, dest, bytes);
 
 	for (i = reg; i < reg + num_regs; i++) {
 		/* Cache is CPU endian */
@@ -96,9 +97,6 @@ static int wm8350_read(struct wm8350 *wm8350, u8 reg, int num_regs, u16 *dest)
 	int ret = 0;
 	int bytes = num_regs * 2;
 
-	if (wm8350->read_dev == NULL)
-		return -ENODEV;
-
 	if ((reg + num_regs - 1) > WM8350_MAX_REGISTER) {
 		dev_err(wm8350->dev, "invalid reg %x\n",
 			reg + num_regs - 1);
@@ -149,9 +147,6 @@ static int wm8350_write(struct wm8350 *wm8350, u8 reg, int num_regs, u16 *src)
 	int end = reg + num_regs;
 	int bytes = num_regs * 2;
 
-	if (wm8350->write_dev == NULL)
-		return -ENODEV;
-
 	if ((reg + num_regs - 1) > WM8350_MAX_REGISTER) {
 		dev_err(wm8350->dev, "invalid reg %x\n",
 			reg + num_regs - 1);
@@ -182,7 +177,7 @@ static int wm8350_write(struct wm8350 *wm8350, u8 reg, int num_regs, u16 *src)
 	}
 
 	/* Actually write it out */
-	return wm8350->write_dev(wm8350, reg, bytes, (char *)src);
+	return regmap_raw_write(wm8350->regmap, reg, src, bytes);
 }
 
 /*
@@ -515,9 +510,8 @@ static int wm8350_create_cache(struct wm8350 *wm8350, int type, int mode)
 	 * a PMIC so the device many not be in a virgin state and we
 	 * can't rely on the silicon values.
 	 */
-	ret = wm8350->read_dev(wm8350, 0,
-			       sizeof(u16) * (WM8350_MAX_REGISTER + 1),
-			       wm8350->reg_cache);
+	ret = regmap_raw_read(wm8350->regmap, 0, wm8350->reg_cache,
+			      sizeof(u16) * (WM8350_MAX_REGISTER + 1));
 	if (ret < 0) {
 		dev_err(wm8350->dev,
 			"failed to read initial cache values\n");
@@ -570,35 +564,30 @@ int wm8350_device_init(struct wm8350 *wm8350, int irq,
 		       struct wm8350_platform_data *pdata)
 {
 	int ret;
-	u16 id1, id2, mask_rev;
-	u16 cust_id, mode, chip_rev;
+	unsigned int id1, id2, mask_rev;
+	unsigned int cust_id, mode, chip_rev;
 
 	dev_set_drvdata(wm8350->dev, wm8350);
 
 	/* get WM8350 revision and config mode */
-	ret = wm8350->read_dev(wm8350, WM8350_RESET_ID, sizeof(id1), &id1);
+	ret = regmap_read(wm8350->regmap, WM8350_RESET_ID, &id1);
 	if (ret != 0) {
 		dev_err(wm8350->dev, "Failed to read ID: %d\n", ret);
 		goto err;
 	}
 
-	ret = wm8350->read_dev(wm8350, WM8350_ID, sizeof(id2), &id2);
+	ret = regmap_read(wm8350->regmap, WM8350_ID, &id2);
 	if (ret != 0) {
 		dev_err(wm8350->dev, "Failed to read ID: %d\n", ret);
 		goto err;
 	}
 
-	ret = wm8350->read_dev(wm8350, WM8350_REVISION, sizeof(mask_rev),
-			       &mask_rev);
+	ret = regmap_read(wm8350->regmap, WM8350_REVISION, &mask_rev);
 	if (ret != 0) {
 		dev_err(wm8350->dev, "Failed to read revision: %d\n", ret);
 		goto err;
 	}
 
-	id1 = be16_to_cpu(id1);
-	id2 = be16_to_cpu(id2);
-	mask_rev = be16_to_cpu(mask_rev);
-
 	if (id1 != 0x6143) {
 		dev_err(wm8350->dev,
 			"Device with ID %x is not a WM8350\n", id1);
diff --git a/drivers/mfd/wm8350-i2c.c b/drivers/mfd/wm8350-i2c.c
index d955faa..271589f 100644
--- a/drivers/mfd/wm8350-i2c.c
+++ b/drivers/mfd/wm8350-i2c.c
@@ -15,47 +15,18 @@
 
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/err.h>
 #include <linux/init.h>
 #include <linux/i2c.h>
 #include <linux/platform_device.h>
 #include <linux/mfd/wm8350/core.h>
+#include <linux/regmap.h>
 #include <linux/slab.h>
 
-static int wm8350_i2c_read_device(struct wm8350 *wm8350, char reg,
-				  int bytes, void *dest)
-{
-	int ret;
-
-	ret = i2c_master_send(wm8350->i2c_client, &reg, 1);
-	if (ret < 0)
-		return ret;
-	ret = i2c_master_recv(wm8350->i2c_client, dest, bytes);
-	if (ret < 0)
-		return ret;
-	if (ret != bytes)
-		return -EIO;
-	return 0;
-}
-
-static int wm8350_i2c_write_device(struct wm8350 *wm8350, char reg,
-				   int bytes, void *src)
-{
-	/* we add 1 byte for device register */
-	u8 msg[(WM8350_MAX_REGISTER << 1) + 1];
-	int ret;
-
-	if (bytes > ((WM8350_MAX_REGISTER << 1) + 1))
-		return -EINVAL;
-
-	msg[0] = reg;
-	memcpy(&msg[1], src, bytes);
-	ret = i2c_master_send(wm8350->i2c_client, msg, bytes + 1);
-	if (ret < 0)
-		return ret;
-	if (ret != bytes + 1)
-		return -EIO;
-	return 0;
-}
+static const struct regmap_config wm8350_regmap = {
+	.reg_bits = 8,
+	.val_bits = 16,
+};
 
 static int wm8350_i2c_probe(struct i2c_client *i2c,
 			    const struct i2c_device_id *id)
@@ -67,11 +38,16 @@ static int wm8350_i2c_probe(struct i2c_client *i2c,
 	if (wm8350 == NULL)
 		return -ENOMEM;
 
+	wm8350->regmap = devm_regmap_init_i2c(i2c, &wm8350_regmap);
+	if (IS_ERR(wm8350->regmap)) {
+		ret = PTR_ERR(wm8350->regmap);
+		dev_err(&i2c->dev, "Failed to allocate register map: %d\n",
+			ret);
+		return ret;
+	}
+
 	i2c_set_clientdata(i2c, wm8350);
 	wm8350->dev = &i2c->dev;
-	wm8350->i2c_client = i2c;
-	wm8350->read_dev = wm8350_i2c_read_device;
-	wm8350->write_dev = wm8350_i2c_write_device;
 
 	ret = wm8350_device_init(wm8350, i2c->irq, i2c->dev.platform_data);
 	if (ret < 0)
@@ -80,6 +56,7 @@ static int wm8350_i2c_probe(struct i2c_client *i2c,
 	return ret;
 
 err:
+	regmap_exit(wm8350->regmap);
 	return ret;
 }
 
diff --git a/include/linux/mfd/wm8350/core.h b/include/linux/mfd/wm8350/core.h
index 98fcc97..9192b64 100644
--- a/include/linux/mfd/wm8350/core.h
+++ b/include/linux/mfd/wm8350/core.h
@@ -602,6 +602,7 @@ extern const u16 wm8352_mode2_defaults[];
 extern const u16 wm8352_mode3_defaults[];
 
 struct wm8350;
+struct regmap;
 
 struct wm8350_hwmon {
 	struct platform_device *pdev;
@@ -612,13 +613,7 @@ struct wm8350 {
 	struct device *dev;
 
 	/* device IO */
-	union {
-		struct i2c_client *i2c_client;
-		struct spi_device *spi_device;
-	};
-	int (*read_dev)(struct wm8350 *wm8350, char reg, int size, void *dest);
-	int (*write_dev)(struct wm8350 *wm8350, char reg, int size,
-			 void *src);
+	struct regmap *regmap;
 	u16 *reg_cache;
 
 	struct mutex auxadc_mutex;
-- 
cgit v0.10.2


From cc7a727941193e3e59be2e9f6522eb78bc7ee909 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 7 May 2012 10:03:22 +0100
Subject: mfd: Read CUST_ID from the wm8994 device

Read CUST_ID from the device and log it for diagnostics.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/wm8994-core.c b/drivers/mfd/wm8994-core.c
index 9d7ca1e..60e6175 100644
--- a/drivers/mfd/wm8994-core.c
+++ b/drivers/mfd/wm8994-core.c
@@ -500,7 +500,8 @@ static __devinit int wm8994_device_init(struct wm8994 *wm8994, int irq)
 			ret);
 		goto err_enable;
 	}
-	wm8994->revision = ret;
+	wm8994->revision = ret & WM8994_CHIP_REV_MASK;
+	wm8994->cust_id = (ret & WM8994_CUST_ID_MASK) >> WM8994_CUST_ID_SHIFT;
 
 	switch (wm8994->type) {
 	case WM8994:
@@ -553,8 +554,8 @@ static __devinit int wm8994_device_init(struct wm8994 *wm8994, int irq)
 		break;
 	}
 
-	dev_info(wm8994->dev, "%s revision %c\n", devname,
-		 'A' + wm8994->revision);
+	dev_info(wm8994->dev, "%s revision %c CUST_ID %02x\n", devname,
+		 'A' + wm8994->revision, wm8994->cust_id);
 
 	switch (wm8994->type) {
 	case WM1811:
diff --git a/include/linux/mfd/wm8994/core.h b/include/linux/mfd/wm8994/core.h
index 9eff2a3..d41bc7b 100644
--- a/include/linux/mfd/wm8994/core.h
+++ b/include/linux/mfd/wm8994/core.h
@@ -57,6 +57,7 @@ struct wm8994 {
 
 	enum wm8994_type type;
 	int revision;
+	int cust_id;
 
 	struct device *dev;
 	struct regmap *regmap;
diff --git a/include/linux/mfd/wm8994/registers.h b/include/linux/mfd/wm8994/registers.h
index 86e6a03..0535489 100644
--- a/include/linux/mfd/wm8994/registers.h
+++ b/include/linux/mfd/wm8994/registers.h
@@ -2212,6 +2212,9 @@
 /*
  * R256 (0x100) - Chip Revision
  */
+#define WM8994_CUST_ID_MASK                     0xFF00  /* CUST_ID - [15:8] */
+#define WM8994_CUST_ID_SHIFT                         8  /* CUST_ID - [15:8] */
+#define WM8994_CUST_ID_WIDTH                         8  /* CUST_ID - [15:8] */
 #define WM8994_CHIP_REV_MASK                    0x000F  /* CHIP_REV - [3:0] */
 #define WM8994_CHIP_REV_SHIFT                        0  /* CHIP_REV - [3:0] */
 #define WM8994_CHIP_REV_WIDTH                        4  /* CHIP_REV - [3:0] */
-- 
cgit v0.10.2


From ceb57d27e28a8f979cbfd6391b7da6da51484059 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 7 May 2012 10:03:23 +0100
Subject: mfd: Convert wm8994 to module_i2c_driver()

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/wm8994-core.c b/drivers/mfd/wm8994-core.c
index 60e6175..1e321d3 100644
--- a/drivers/mfd/wm8994-core.c
+++ b/drivers/mfd/wm8994-core.c
@@ -733,23 +733,7 @@ static struct i2c_driver wm8994_i2c_driver = {
 	.id_table = wm8994_i2c_id,
 };
 
-static int __init wm8994_i2c_init(void)
-{
-	int ret;
-
-	ret = i2c_add_driver(&wm8994_i2c_driver);
-	if (ret != 0)
-		pr_err("Failed to register wm8994 I2C driver: %d\n", ret);
-
-	return ret;
-}
-module_init(wm8994_i2c_init);
-
-static void __exit wm8994_i2c_exit(void)
-{
-	i2c_del_driver(&wm8994_i2c_driver);
-}
-module_exit(wm8994_i2c_exit);
+module_i2c_driver(wm8994_i2c_driver);
 
 MODULE_DESCRIPTION("Core support for the WM8994 audio CODEC");
 MODULE_LICENSE("GPL");
-- 
cgit v0.10.2


From 14546c337588370dced50dcaf43398939be9829e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 6 May 2012 19:10:59 -0400
Subject: NFS: Don't do a full flush to disk on close() if we hold a delegation

If we hold a delegation then we know that it should be safe to continue
to cache the data beyond the close(). However since the process that wrote
the data may die after close(), we may still want to send the data to
server before those RPCSEC_GSS credentials expire. We therefore compromise
by starting writeback to the server, but don't wait for completion.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index aa9b709..8eda8a6 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -174,6 +174,13 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 	if ((file->f_mode & FMODE_WRITE) == 0)
 		return 0;
 
+	/*
+	 * If we're holding a write delegation, then just start the i/o
+	 * but don't wait for completion (or send a commit).
+	 */
+	if (nfs_have_delegation(inode, FMODE_WRITE))
+		return filemap_fdatawrite(file->f_mapping);
+
 	/* Flush writes to the server and return any errors */
 	return vfs_fsync(file, 0);
 }
-- 
cgit v0.10.2


From c57d1bc5e043dbb5ba82ded07003d71a8033d899 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 6 May 2012 19:34:17 -0400
Subject: NFS: nfs_inode_return_delegation() should always flush dirty data

The assumption is that if you are in a situation where you need to
return the delegation, then you should probably stop caching the
data anyway.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 89af1d2..a19cb5a 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -380,6 +380,10 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
  * nfs_inode_return_delegation - synchronously return a delegation
  * @inode: inode to process
  *
+ * This routine will always flush any dirty data to disk on the
+ * assumption that if we need to return the delegation, then
+ * we should stop caching.
+ *
  * Returns zero on success, or a negative errno value.
  */
 int nfs_inode_return_delegation(struct inode *inode)
@@ -389,10 +393,10 @@ int nfs_inode_return_delegation(struct inode *inode)
 	struct nfs_delegation *delegation;
 	int err = 0;
 
+	nfs_wb_all(inode);
 	if (rcu_access_pointer(nfsi->delegation) != NULL) {
 		delegation = nfs_detach_delegation(nfsi, server);
 		if (delegation != NULL) {
-			nfs_wb_all(inode);
 			err = __nfs_inode_return_delegation(inode, delegation, 1);
 		}
 	}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index cd6a7a8..72709c4 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -66,6 +66,7 @@ static inline int nfs_have_delegation(struct inode *inode, fmode_t flags)
 
 static inline int nfs_inode_return_delegation(struct inode *inode)
 {
+	nfs_wb_all(inode);
 	return 0;
 }
 #endif
-- 
cgit v0.10.2


From dc327ed4cd320be689596365372a3683208c3ba0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 6 May 2012 19:46:30 -0400
Subject: NFSv4: nfs_client_return_marked_delegations can't flush data

Since even filemap_flush() needs to lock pages that are dirty, we
cannot risk calling it from the state manager context. Therefore,
we need to move the call to filemap_flush() to
nfs_async_inode_return_delegation().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index a19cb5a..bd3a960 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -316,6 +316,10 @@ out:
  * nfs_client_return_marked_delegations - return previously marked delegations
  * @clp: nfs_client to process
  *
+ * Note that this function is designed to be called by the state
+ * manager thread. For this reason, it cannot flush the dirty data,
+ * since that could deadlock in case of a state recovery error.
+ *
  * Returns zero on success, or a negative errno value.
  */
 int nfs_client_return_marked_delegations(struct nfs_client *clp)
@@ -340,11 +344,9 @@ restart:
 								server);
 			rcu_read_unlock();
 
-			if (delegation != NULL) {
-				filemap_flush(inode->i_mapping);
+			if (delegation != NULL)
 				err = __nfs_inode_return_delegation(inode,
 								delegation, 0);
-			}
 			iput(inode);
 			if (!err)
 				goto restart;
@@ -542,6 +544,8 @@ int nfs_async_inode_return_delegation(struct inode *inode,
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs_delegation *delegation;
 
+	filemap_flush(inode->i_mapping);
+
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
 
-- 
cgit v0.10.2


From 35bdd29095ad614c5fb4a934bfd4f57a94dfd395 Mon Sep 17 00:00:00 2001
From: Alessandro Rubini <rubini@gnudd.com>
Date: Thu, 12 Apr 2012 10:48:44 +0200
Subject: mfd: Add driver for STA2X11 MFD block

This also introduces <asm/sta2x11.h> to export a function that is in
the base sta2x11 support patches. The header will increase with other
prototypes and constants over time.

Signed-off-by: Alessandro Rubini <rubini@gnudd.com>
Acked-by: Giancarlo Asnaghi <giancarlo.asnaghi@st.com>
Cc: Alan Cox <alan@linux.intel.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h
new file mode 100644
index 0000000..e9d32df
--- /dev/null
+++ b/arch/x86/include/asm/sta2x11.h
@@ -0,0 +1,12 @@
+/*
+ * Header file for STMicroelectronics ConneXt (STA2X11) IOHub
+ */
+#ifndef __ASM_STA2X11_H
+#define __ASM_STA2X11_H
+
+#include <linux/pci.h>
+
+/* This needs to be called from the MFD to configure its sub-devices */
+struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev);
+
+#endif /* __ASM_STA2X11_H */
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index ef86a74..48eed22 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -906,6 +906,11 @@ config MFD_RC5T583
 	  Additional drivers must be enabled in order to use the
 	  different functionality of the device.
 
+config MFD_STA2X11
+	bool "STA2X11 multi function device support"
+	depends on STA2X11
+	select MFD_CORE
+
 config MFD_ANATOP
 	bool "Support for Freescale i.MX on-chip ANATOP controller"
 	depends on SOC_IMX6Q
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 5dd6be7..0dc55cb 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_MFD_DAVINCI_VOICECODEC)	+= davinci_voicecodec.o
 obj-$(CONFIG_MFD_DM355EVM_MSP)	+= dm355evm_msp.o
 obj-$(CONFIG_MFD_TI_SSP)	+= ti-ssp.o
 
+obj-$(CONFIG_MFD_STA2X11)	+= sta2x11-mfd.o
 obj-$(CONFIG_MFD_STMPE)		+= stmpe.o
 obj-$(CONFIG_STMPE_I2C)		+= stmpe-i2c.o
 obj-$(CONFIG_STMPE_SPI)		+= stmpe-spi.o
diff --git a/drivers/mfd/sta2x11-mfd.c b/drivers/mfd/sta2x11-mfd.c
new file mode 100644
index 0000000..d31fed0
--- /dev/null
+++ b/drivers/mfd/sta2x11-mfd.c
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2009-2011 Wind River Systems, Inc.
+ * Copyright (c) 2011 ST Microelectronics (Alessandro Rubini)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/pci.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/sta2x11-mfd.h>
+
+#include <asm/sta2x11.h>
+
+/* This describes STA2X11 MFD chip for us, we may have several */
+struct sta2x11_mfd {
+	struct sta2x11_instance *instance;
+	spinlock_t lock;
+	struct list_head list;
+	void __iomem *sctl_regs;
+	void __iomem *apbreg_regs;
+};
+
+static LIST_HEAD(sta2x11_mfd_list);
+
+/* Three functions to act on the list */
+static struct sta2x11_mfd *sta2x11_mfd_find(struct pci_dev *pdev)
+{
+	struct sta2x11_instance *instance;
+	struct sta2x11_mfd *mfd;
+
+	if (!pdev && !list_empty(&sta2x11_mfd_list)) {
+		pr_warning("%s: Unspecified device, "
+			    "using first instance\n", __func__);
+		return list_entry(sta2x11_mfd_list.next,
+				  struct sta2x11_mfd, list);
+	}
+
+	instance = sta2x11_get_instance(pdev);
+	if (!instance)
+		return NULL;
+	list_for_each_entry(mfd, &sta2x11_mfd_list, list) {
+		if (mfd->instance == instance)
+			return mfd;
+	}
+	return NULL;
+}
+
+static int __devinit sta2x11_mfd_add(struct pci_dev *pdev, gfp_t flags)
+{
+	struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev);
+	struct sta2x11_instance *instance;
+
+	if (mfd)
+		return -EBUSY;
+	instance = sta2x11_get_instance(pdev);
+	if (!instance)
+		return -EINVAL;
+	mfd = kzalloc(sizeof(*mfd), flags);
+	if (!mfd)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&mfd->list);
+	spin_lock_init(&mfd->lock);
+	mfd->instance = instance;
+	list_add(&mfd->list, &sta2x11_mfd_list);
+	return 0;
+}
+
+static int __devexit mfd_remove(struct pci_dev *pdev)
+{
+	struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev);
+
+	if (!mfd)
+		return -ENODEV;
+	list_del(&mfd->list);
+	kfree(mfd);
+	return 0;
+}
+
+/* These two functions are exported and are not expected to fail */
+u32 sta2x11_sctl_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val)
+{
+	struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev);
+	u32 r;
+	unsigned long flags;
+
+	if (!mfd) {
+		dev_warn(&pdev->dev, ": can't access sctl regs\n");
+		return 0;
+	}
+	if (!mfd->sctl_regs) {
+		dev_warn(&pdev->dev, ": system ctl not initialized\n");
+		return 0;
+	}
+	spin_lock_irqsave(&mfd->lock, flags);
+	r = readl(mfd->sctl_regs + reg);
+	r &= ~mask;
+	r |= val;
+	if (mask)
+		writel(r, mfd->sctl_regs + reg);
+	spin_unlock_irqrestore(&mfd->lock, flags);
+	return r;
+}
+EXPORT_SYMBOL(sta2x11_sctl_mask);
+
+u32 sta2x11_apbreg_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val)
+{
+	struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev);
+	u32 r;
+	unsigned long flags;
+
+	if (!mfd) {
+		dev_warn(&pdev->dev, ": can't access apb regs\n");
+		return 0;
+	}
+	if (!mfd->apbreg_regs) {
+		dev_warn(&pdev->dev, ": apb bridge not initialized\n");
+		return 0;
+	}
+	spin_lock_irqsave(&mfd->lock, flags);
+	r = readl(mfd->apbreg_regs + reg);
+	r &= ~mask;
+	r |= val;
+	if (mask)
+		writel(r, mfd->apbreg_regs + reg);
+	spin_unlock_irqrestore(&mfd->lock, flags);
+	return r;
+}
+EXPORT_SYMBOL(sta2x11_apbreg_mask);
+
+/* Two debugfs files, for our registers (FIXME: one instance only) */
+#define REG(regname) {.name = #regname, .offset = SCTL_ ## regname}
+static struct debugfs_reg32 sta2x11_sctl_regs[] = {
+	REG(SCCTL), REG(ARMCFG), REG(SCPLLCTL), REG(SCPLLFCTRL),
+	REG(SCRESFRACT), REG(SCRESCTRL1), REG(SCRESXTRL2), REG(SCPEREN0),
+	REG(SCPEREN1), REG(SCPEREN2), REG(SCGRST), REG(SCPCIPMCR1),
+	REG(SCPCIPMCR2), REG(SCPCIPMSR1), REG(SCPCIPMSR2), REG(SCPCIPMSR3),
+	REG(SCINTREN), REG(SCRISR), REG(SCCLKSTAT0), REG(SCCLKSTAT1),
+	REG(SCCLKSTAT2), REG(SCRSTSTA),
+};
+#undef REG
+
+static struct debugfs_regset32 sctl_regset = {
+	.regs = sta2x11_sctl_regs,
+	.nregs = ARRAY_SIZE(sta2x11_sctl_regs),
+};
+
+#define REG(regname) {.name = #regname, .offset = regname}
+static struct debugfs_reg32 sta2x11_apbreg_regs[] = {
+	REG(APBREG_BSR), REG(APBREG_PAER), REG(APBREG_PWAC), REG(APBREG_PRAC),
+	REG(APBREG_PCG), REG(APBREG_PUR), REG(APBREG_EMU_PCG),
+};
+#undef REG
+
+static struct debugfs_regset32 apbreg_regset = {
+	.regs = sta2x11_apbreg_regs,
+	.nregs = ARRAY_SIZE(sta2x11_apbreg_regs),
+};
+
+static struct dentry *sta2x11_sctl_debugfs;
+static struct dentry *sta2x11_apbreg_debugfs;
+
+/* Probe for the two platform devices */
+static int sta2x11_sctl_probe(struct platform_device *dev)
+{
+	struct pci_dev **pdev;
+	struct sta2x11_mfd *mfd;
+	struct resource *res;
+
+	pdev = dev->dev.platform_data;
+	mfd = sta2x11_mfd_find(*pdev);
+	if (!mfd)
+		return -ENODEV;
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENOMEM;
+
+	if (!request_mem_region(res->start, resource_size(res),
+				"sta2x11-sctl"))
+		return -EBUSY;
+
+	mfd->sctl_regs = ioremap(res->start, resource_size(res));
+	if (!mfd->sctl_regs) {
+		release_mem_region(res->start, resource_size(res));
+		return -ENOMEM;
+	}
+	sctl_regset.base = mfd->sctl_regs;
+	sta2x11_sctl_debugfs = debugfs_create_regset32("sta2x11-sctl",
+						  S_IFREG | S_IRUGO,
+						  NULL, &sctl_regset);
+	return 0;
+}
+
+static int sta2x11_apbreg_probe(struct platform_device *dev)
+{
+	struct pci_dev **pdev;
+	struct sta2x11_mfd *mfd;
+	struct resource *res;
+
+	pdev = dev->dev.platform_data;
+	dev_dbg(&dev->dev, "%s: pdata is %p\n", __func__, pdev);
+	dev_dbg(&dev->dev, "%s: *pdata is %p\n", __func__, *pdev);
+
+	mfd = sta2x11_mfd_find(*pdev);
+	if (!mfd)
+		return -ENODEV;
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENOMEM;
+
+	if (!request_mem_region(res->start, resource_size(res),
+				"sta2x11-apbreg"))
+		return -EBUSY;
+
+	mfd->apbreg_regs = ioremap(res->start, resource_size(res));
+	if (!mfd->apbreg_regs) {
+		release_mem_region(res->start, resource_size(res));
+		return -ENOMEM;
+	}
+	dev_dbg(&dev->dev, "%s: regbase %p\n", __func__, mfd->apbreg_regs);
+
+	apbreg_regset.base = mfd->apbreg_regs;
+	sta2x11_apbreg_debugfs = debugfs_create_regset32("sta2x11-apbreg",
+						  S_IFREG | S_IRUGO,
+						  NULL, &apbreg_regset);
+	return 0;
+}
+
+/* The two platform drivers */
+static struct platform_driver sta2x11_sctl_platform_driver = {
+	.driver = {
+		.name	= "sta2x11-sctl",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= sta2x11_sctl_probe,
+};
+
+static int __init sta2x11_sctl_init(void)
+{
+	pr_info("%s\n", __func__);
+	return platform_driver_register(&sta2x11_sctl_platform_driver);
+}
+
+static struct platform_driver sta2x11_platform_driver = {
+	.driver = {
+		.name	= "sta2x11-apbreg",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= sta2x11_apbreg_probe,
+};
+
+static int __init sta2x11_apbreg_init(void)
+{
+	pr_info("%s\n", __func__);
+	return platform_driver_register(&sta2x11_platform_driver);
+}
+
+/*
+ * What follows is the PCI device that hosts the above two pdevs.
+ * Each logic block is 4kB and they are all consecutive: we use this info.
+ */
+
+/* Bar 0 */
+enum bar0_cells {
+	STA2X11_GPIO_0 = 0,
+	STA2X11_GPIO_1,
+	STA2X11_GPIO_2,
+	STA2X11_GPIO_3,
+	STA2X11_SCTL,
+	STA2X11_SCR,
+	STA2X11_TIME,
+};
+/* Bar 1 */
+enum bar1_cells {
+	STA2X11_APBREG = 0,
+};
+#define CELL_4K(_name, _cell) { \
+		.name = _name, \
+		.start = _cell * 4096, .end = _cell * 4096 + 4095, \
+		.flags = IORESOURCE_MEM, \
+		}
+
+static const __devinitconst struct resource gpio_resources[] = {
+	{
+		.name = "sta2x11_gpio", /* 4 consecutive cells, 1 driver */
+		.start = 0,
+		.end = (4 * 4096) - 1,
+		.flags = IORESOURCE_MEM,
+	}
+};
+static const __devinitconst struct resource sctl_resources[] = {
+	CELL_4K("sta2x11-sctl", STA2X11_SCTL),
+};
+static const __devinitconst struct resource scr_resources[] = {
+	CELL_4K("sta2x11-scr", STA2X11_SCR),
+};
+static const __devinitconst struct resource time_resources[] = {
+	CELL_4K("sta2x11-time", STA2X11_TIME),
+};
+
+static const __devinitconst struct resource apbreg_resources[] = {
+	CELL_4K("sta2x11-apbreg", STA2X11_APBREG),
+};
+
+#define DEV(_name, _r) \
+	{ .name = _name, .num_resources = ARRAY_SIZE(_r), .resources = _r, }
+
+static __devinitdata struct mfd_cell sta2x11_mfd_bar0[] = {
+	DEV("sta2x11-gpio", gpio_resources), /* offset 0: we add pdata later */
+	DEV("sta2x11-sctl", sctl_resources),
+	DEV("sta2x11-scr", scr_resources),
+	DEV("sta2x11-time", time_resources),
+};
+
+static __devinitdata struct mfd_cell sta2x11_mfd_bar1[] = {
+	DEV("sta2x11-apbreg", apbreg_resources),
+};
+
+static int sta2x11_mfd_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	pci_save_state(pdev);
+	pci_disable_device(pdev);
+	pci_set_power_state(pdev, pci_choose_state(pdev, state));
+
+	return 0;
+}
+
+static int sta2x11_mfd_resume(struct pci_dev *pdev)
+{
+	int err;
+
+	pci_set_power_state(pdev, 0);
+	err = pci_enable_device(pdev);
+	if (err)
+		return err;
+	pci_restore_state(pdev);
+
+	return 0;
+}
+
+static int __devinit sta2x11_mfd_probe(struct pci_dev *pdev,
+				       const struct pci_device_id *pci_id)
+{
+	int err, i;
+	struct sta2x11_gpio_pdata *gpio_data;
+
+	dev_info(&pdev->dev, "%s\n", __func__);
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Can't enable device.\n");
+		return err;
+	}
+
+	err = pci_enable_msi(pdev);
+	if (err)
+		dev_info(&pdev->dev, "Enable msi failed\n");
+
+	/* Read gpio config data as pci device's platform data */
+	gpio_data = dev_get_platdata(&pdev->dev);
+	if (!gpio_data)
+		dev_warn(&pdev->dev, "no gpio configuration\n");
+
+	dev_dbg(&pdev->dev, "%s, gpio_data = %p (%p)\n", __func__,
+		gpio_data, &gpio_data);
+	dev_dbg(&pdev->dev, "%s, pdev = %p (%p)\n", __func__,
+		pdev, &pdev);
+
+	/* platform data is the pci device for all of them */
+	for (i = 0; i < ARRAY_SIZE(sta2x11_mfd_bar0); i++) {
+		sta2x11_mfd_bar0[i].pdata_size = sizeof(pdev);
+		sta2x11_mfd_bar0[i].platform_data = &pdev;
+	}
+	sta2x11_mfd_bar1[0].pdata_size = sizeof(pdev);
+	sta2x11_mfd_bar1[0].platform_data = &pdev;
+
+	/* Record this pdev before mfd_add_devices: their probe looks for it */
+	sta2x11_mfd_add(pdev, GFP_ATOMIC);
+
+
+	err = mfd_add_devices(&pdev->dev, -1,
+			      sta2x11_mfd_bar0,
+			      ARRAY_SIZE(sta2x11_mfd_bar0),
+			      &pdev->resource[0],
+			      0);
+	if (err) {
+		dev_err(&pdev->dev, "mfd_add_devices[0] failed: %d\n", err);
+		goto err_disable;
+	}
+
+	err = mfd_add_devices(&pdev->dev, -1,
+			      sta2x11_mfd_bar1,
+			      ARRAY_SIZE(sta2x11_mfd_bar1),
+			      &pdev->resource[1],
+			      0);
+	if (err) {
+		dev_err(&pdev->dev, "mfd_add_devices[1] failed: %d\n", err);
+		goto err_disable;
+	}
+
+	return 0;
+
+err_disable:
+	mfd_remove_devices(&pdev->dev);
+	pci_disable_device(pdev);
+	pci_disable_msi(pdev);
+	return err;
+}
+
+static DEFINE_PCI_DEVICE_TABLE(sta2x11_mfd_tbl) = {
+	{PCI_DEVICE(PCI_VENDOR_ID_STMICRO, PCI_DEVICE_ID_STMICRO_GPIO)},
+	{0,},
+};
+
+static struct pci_driver sta2x11_mfd_driver = {
+	.name =		"sta2x11-mfd",
+	.id_table =	sta2x11_mfd_tbl,
+	.probe =	sta2x11_mfd_probe,
+	.suspend =	sta2x11_mfd_suspend,
+	.resume =	sta2x11_mfd_resume,
+};
+
+static int __init sta2x11_mfd_init(void)
+{
+	pr_info("%s\n", __func__);
+	return pci_register_driver(&sta2x11_mfd_driver);
+}
+
+/*
+ * All of this must be ready before "normal" devices like MMCI appear.
+ * But MFD (the pci device) can't be too early. The following choice
+ * prepares platform drivers very early and probe the PCI device later,
+ * but before other PCI devices.
+ */
+subsys_initcall(sta2x11_apbreg_init);
+subsys_initcall(sta2x11_sctl_init);
+rootfs_initcall(sta2x11_mfd_init);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Wind River");
+MODULE_DESCRIPTION("STA2x11 mfd for GPIO, SCTL and APBREG");
+MODULE_DEVICE_TABLE(pci, sta2x11_mfd_tbl);
diff --git a/include/linux/mfd/sta2x11-mfd.h b/include/linux/mfd/sta2x11-mfd.h
new file mode 100644
index 0000000..d179227
--- /dev/null
+++ b/include/linux/mfd/sta2x11-mfd.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2009-2011 Wind River Systems, Inc.
+ * Copyright (c) 2011 ST Microelectronics (Alessandro Rubini)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * The STMicroelectronics ConneXt (STA2X11) chip has several unrelated
+ * functions in one PCI endpoint functions. This driver simply
+ * registers the platform devices in this iomemregion and exports a few
+ * functions to access common registers
+ */
+
+#ifndef __STA2X11_MFD_H
+#define __STA2X11_MFD_H
+#include <linux/types.h>
+#include <linux/pci.h>
+
+/*
+ * The MFD PCI block includes the GPIO peripherals and other register blocks.
+ * For GPIO, we have 32*4 bits (I use "gsta" for "gpio sta2x11".)
+ */
+#define GSTA_GPIO_PER_BLOCK	32
+#define GSTA_NR_BLOCKS		4
+#define GSTA_NR_GPIO		(GSTA_GPIO_PER_BLOCK * GSTA_NR_BLOCKS)
+
+/* Pinconfig is set by the board definition: altfunc, pull-up, pull-down */
+struct sta2x11_gpio_pdata {
+	unsigned pinconfig[GSTA_NR_GPIO];
+};
+
+/* Macros below lifted from sh_pfc.h, with minor differences */
+#define PINMUX_TYPE_NONE		0
+#define PINMUX_TYPE_FUNCTION		1
+#define PINMUX_TYPE_OUTPUT_LOW		2
+#define PINMUX_TYPE_OUTPUT_HIGH		3
+#define PINMUX_TYPE_INPUT		4
+#define PINMUX_TYPE_INPUT_PULLUP	5
+#define PINMUX_TYPE_INPUT_PULLDOWN	6
+
+/* Give names to GPIO pins, like PXA does, taken from the manual */
+#define STA2X11_GPIO0			0
+#define STA2X11_GPIO1			1
+#define STA2X11_GPIO2			2
+#define STA2X11_GPIO3			3
+#define STA2X11_GPIO4			4
+#define STA2X11_GPIO5			5
+#define STA2X11_GPIO6			6
+#define STA2X11_GPIO7			7
+#define STA2X11_GPIO8_RGBOUT_RED7	8
+#define STA2X11_GPIO9_RGBOUT_RED6	9
+#define STA2X11_GPIO10_RGBOUT_RED5	10
+#define STA2X11_GPIO11_RGBOUT_RED4	11
+#define STA2X11_GPIO12_RGBOUT_RED3	12
+#define STA2X11_GPIO13_RGBOUT_RED2	13
+#define STA2X11_GPIO14_RGBOUT_RED1	14
+#define STA2X11_GPIO15_RGBOUT_RED0	15
+#define STA2X11_GPIO16_RGBOUT_GREEN7	16
+#define STA2X11_GPIO17_RGBOUT_GREEN6	17
+#define STA2X11_GPIO18_RGBOUT_GREEN5	18
+#define STA2X11_GPIO19_RGBOUT_GREEN4	19
+#define STA2X11_GPIO20_RGBOUT_GREEN3	20
+#define STA2X11_GPIO21_RGBOUT_GREEN2	21
+#define STA2X11_GPIO22_RGBOUT_GREEN1	22
+#define STA2X11_GPIO23_RGBOUT_GREEN0	23
+#define STA2X11_GPIO24_RGBOUT_BLUE7	24
+#define STA2X11_GPIO25_RGBOUT_BLUE6	25
+#define STA2X11_GPIO26_RGBOUT_BLUE5	26
+#define STA2X11_GPIO27_RGBOUT_BLUE4	27
+#define STA2X11_GPIO28_RGBOUT_BLUE3	28
+#define STA2X11_GPIO29_RGBOUT_BLUE2	29
+#define STA2X11_GPIO30_RGBOUT_BLUE1	30
+#define STA2X11_GPIO31_RGBOUT_BLUE0	31
+#define STA2X11_GPIO32_RGBOUT_VSYNCH	32
+#define STA2X11_GPIO33_RGBOUT_HSYNCH	33
+#define STA2X11_GPIO34_RGBOUT_DEN	34
+#define STA2X11_GPIO35_ETH_CRS_DV	35
+#define STA2X11_GPIO36_ETH_TXD1		36
+#define STA2X11_GPIO37_ETH_TXD0		37
+#define STA2X11_GPIO38_ETH_TX_EN	38
+#define STA2X11_GPIO39_MDIO		39
+#define STA2X11_GPIO40_ETH_REF_CLK	40
+#define STA2X11_GPIO41_ETH_RXD1		41
+#define STA2X11_GPIO42_ETH_RXD0		42
+#define STA2X11_GPIO43_MDC		43
+#define STA2X11_GPIO44_CAN_TX		44
+#define STA2X11_GPIO45_CAN_RX		45
+#define STA2X11_GPIO46_MLB_DAT		46
+#define STA2X11_GPIO47_MLB_SIG		47
+#define STA2X11_GPIO48_SPI0_CLK		48
+#define STA2X11_GPIO49_SPI0_TXD		49
+#define STA2X11_GPIO50_SPI0_RXD		50
+#define STA2X11_GPIO51_SPI0_FRM		51
+#define STA2X11_GPIO52_SPI1_CLK		52
+#define STA2X11_GPIO53_SPI1_TXD		53
+#define STA2X11_GPIO54_SPI1_RXD		54
+#define STA2X11_GPIO55_SPI1_FRM		55
+#define STA2X11_GPIO56_SPI2_CLK		56
+#define STA2X11_GPIO57_SPI2_TXD		57
+#define STA2X11_GPIO58_SPI2_RXD		58
+#define STA2X11_GPIO59_SPI2_FRM		59
+#define STA2X11_GPIO60_I2C0_SCL		60
+#define STA2X11_GPIO61_I2C0_SDA		61
+#define STA2X11_GPIO62_I2C1_SCL		62
+#define STA2X11_GPIO63_I2C1_SDA		63
+#define STA2X11_GPIO64_I2C2_SCL		64
+#define STA2X11_GPIO65_I2C2_SDA		65
+#define STA2X11_GPIO66_I2C3_SCL		66
+#define STA2X11_GPIO67_I2C3_SDA		67
+#define STA2X11_GPIO68_MSP0_RCK		68
+#define STA2X11_GPIO69_MSP0_RXD		69
+#define STA2X11_GPIO70_MSP0_RFS		70
+#define STA2X11_GPIO71_MSP0_TCK		71
+#define STA2X11_GPIO72_MSP0_TXD		72
+#define STA2X11_GPIO73_MSP0_TFS		73
+#define STA2X11_GPIO74_MSP0_SCK		74
+#define STA2X11_GPIO75_MSP1_CK		75
+#define STA2X11_GPIO76_MSP1_RXD		76
+#define STA2X11_GPIO77_MSP1_FS		77
+#define STA2X11_GPIO78_MSP1_TXD		78
+#define STA2X11_GPIO79_MSP2_CK		79
+#define STA2X11_GPIO80_MSP2_RXD		80
+#define STA2X11_GPIO81_MSP2_FS		81
+#define STA2X11_GPIO82_MSP2_TXD		82
+#define STA2X11_GPIO83_MSP3_CK		83
+#define STA2X11_GPIO84_MSP3_RXD		84
+#define STA2X11_GPIO85_MSP3_FS		85
+#define STA2X11_GPIO86_MSP3_TXD		86
+#define STA2X11_GPIO87_MSP4_CK		87
+#define STA2X11_GPIO88_MSP4_RXD		88
+#define STA2X11_GPIO89_MSP4_FS		89
+#define STA2X11_GPIO90_MSP4_TXD		90
+#define STA2X11_GPIO91_MSP5_CK		91
+#define STA2X11_GPIO92_MSP5_RXD		92
+#define STA2X11_GPIO93_MSP5_FS		93
+#define STA2X11_GPIO94_MSP5_TXD		94
+#define STA2X11_GPIO95_SDIO3_DAT3	95
+#define STA2X11_GPIO96_SDIO3_DAT2	96
+#define STA2X11_GPIO97_SDIO3_DAT1	97
+#define STA2X11_GPIO98_SDIO3_DAT0	98
+#define STA2X11_GPIO99_SDIO3_CLK	99
+#define STA2X11_GPIO100_SDIO3_CMD	100
+#define STA2X11_GPIO101			101
+#define STA2X11_GPIO102			102
+#define STA2X11_GPIO103			103
+#define STA2X11_GPIO104			104
+#define STA2X11_GPIO105_SDIO2_DAT3	105
+#define STA2X11_GPIO106_SDIO2_DAT2	106
+#define STA2X11_GPIO107_SDIO2_DAT1	107
+#define STA2X11_GPIO108_SDIO2_DAT0	108
+#define STA2X11_GPIO109_SDIO2_CLK	109
+#define STA2X11_GPIO110_SDIO2_CMD	110
+#define STA2X11_GPIO111			111
+#define STA2X11_GPIO112			112
+#define STA2X11_GPIO113			113
+#define STA2X11_GPIO114			114
+#define STA2X11_GPIO115_SDIO1_DAT3	115
+#define STA2X11_GPIO116_SDIO1_DAT2	116
+#define STA2X11_GPIO117_SDIO1_DAT1	117
+#define STA2X11_GPIO118_SDIO1_DAT0	118
+#define STA2X11_GPIO119_SDIO1_CLK	119
+#define STA2X11_GPIO120_SDIO1_CMD	120
+#define STA2X11_GPIO121			121
+#define STA2X11_GPIO122			122
+#define STA2X11_GPIO123			123
+#define STA2X11_GPIO124			124
+#define STA2X11_GPIO125_UART2_TXD	125
+#define STA2X11_GPIO126_UART2_RXD	126
+#define STA2X11_GPIO127_UART3_TXD	127
+
+/*
+ * The APB bridge has its own registers, needed by our users as well.
+ * They are accessed with the following read/mask/write function.
+ */
+u32 sta2x11_apbreg_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val);
+
+/* CAN and MLB */
+#define APBREG_BSR	0x00	/* Bridge Status Reg */
+#define APBREG_PAER	0x08	/* Peripherals Address Error Reg */
+#define APBREG_PWAC	0x20	/* Peripheral Write Access Control reg */
+#define APBREG_PRAC	0x40	/* Peripheral Read Access Control reg */
+#define APBREG_PCG	0x60	/* Peripheral Clock Gating Reg */
+#define APBREG_PUR	0x80	/* Peripheral Under Reset Reg */
+#define APBREG_EMU_PCG	0xA0	/* Emulator Peripheral Clock Gating Reg */
+
+#define APBREG_CAN	(1 << 1)
+#define APBREG_MLB	(1 << 3)
+
+/* SARAC */
+#define APBREG_BSR_SARAC     0x100 /* Bridge Status Reg */
+#define APBREG_PAER_SARAC    0x108 /* Peripherals Address Error Reg */
+#define APBREG_PWAC_SARAC    0x120 /* Peripheral Write Access Control reg */
+#define APBREG_PRAC_SARAC    0x140 /* Peripheral Read Access Control reg */
+#define APBREG_PCG_SARAC     0x160 /* Peripheral Clock Gating Reg */
+#define APBREG_PUR_SARAC     0x180 /* Peripheral Under Reset Reg */
+#define APBREG_EMU_PCG_SARAC 0x1A0 /* Emulator Peripheral Clock Gating Reg */
+
+#define APBREG_SARAC	(1 << 2)
+
+/*
+ * The system controller has its own registers. Some of these are accessed
+ * by out users as well, using the following read/mask/write/function
+ */
+u32 sta2x11_sctl_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val);
+
+#define SCTL_SCCTL		0x00	/* System controller control register */
+#define SCTL_ARMCFG		0x04	/* ARM configuration register */
+#define SCTL_SCPLLCTL		0x08	/* PLL control status register */
+#define SCTL_SCPLLFCTRL		0x0c	/* PLL frequency control register */
+#define SCTL_SCRESFRACT		0x10	/* PLL fractional input register */
+#define SCTL_SCRESCTRL1		0x14	/* Peripheral reset control 1 */
+#define SCTL_SCRESXTRL2		0x18	/* Peripheral reset control 2 */
+#define SCTL_SCPEREN0		0x1c	/* Peripheral clock enable register 0 */
+#define SCTL_SCPEREN1		0x20	/* Peripheral clock enable register 1 */
+#define SCTL_SCPEREN2		0x24	/* Peripheral clock enable register 2 */
+#define SCTL_SCGRST		0x28	/* Peripheral global reset */
+#define SCTL_SCPCIPMCR1		0x30	/* PCI power management control 1 */
+#define SCTL_SCPCIPMCR2		0x34	/* PCI power management control 2 */
+#define SCTL_SCPCIPMSR1		0x38	/* PCI power management status 1 */
+#define SCTL_SCPCIPMSR2		0x3c	/* PCI power management status 2 */
+#define SCTL_SCPCIPMSR3		0x40	/* PCI power management status 3 */
+#define SCTL_SCINTREN		0x44	/* Interrupt enable */
+#define SCTL_SCRISR		0x48	/* RAW interrupt status */
+#define SCTL_SCCLKSTAT0		0x4c	/* Peripheral clocks status 0 */
+#define SCTL_SCCLKSTAT1		0x50	/* Peripheral clocks status 1 */
+#define SCTL_SCCLKSTAT2		0x54	/* Peripheral clocks status 2 */
+#define SCTL_SCRSTSTA		0x58	/* Reset status register */
+
+#define SCTL_SCRESCTRL1_USB_PHY_POR	(1 << 0)
+#define SCTL_SCRESCTRL1_USB_OTG	(1 << 1)
+#define SCTL_SCRESCTRL1_USB_HRST	(1 << 2)
+#define SCTL_SCRESCTRL1_USB_PHY_HOST	(1 << 3)
+#define SCTL_SCRESCTRL1_SATAII	(1 << 4)
+#define SCTL_SCRESCTRL1_VIP		(1 << 5)
+#define SCTL_SCRESCTRL1_PER_MMC0	(1 << 6)
+#define SCTL_SCRESCTRL1_PER_MMC1	(1 << 7)
+#define SCTL_SCRESCTRL1_PER_GPIO0	(1 << 8)
+#define SCTL_SCRESCTRL1_PER_GPIO1	(1 << 9)
+#define SCTL_SCRESCTRL1_PER_GPIO2	(1 << 10)
+#define SCTL_SCRESCTRL1_PER_GPIO3	(1 << 11)
+#define SCTL_SCRESCTRL1_PER_MTU0	(1 << 12)
+#define SCTL_SCRESCTRL1_KER_SPI0	(1 << 13)
+#define SCTL_SCRESCTRL1_KER_SPI1	(1 << 14)
+#define SCTL_SCRESCTRL1_KER_SPI2	(1 << 15)
+#define SCTL_SCRESCTRL1_KER_MCI0	(1 << 16)
+#define SCTL_SCRESCTRL1_KER_MCI1	(1 << 17)
+#define SCTL_SCRESCTRL1_PRE_HSI2C0	(1 << 18)
+#define SCTL_SCRESCTRL1_PER_HSI2C1	(1 << 19)
+#define SCTL_SCRESCTRL1_PER_HSI2C2	(1 << 20)
+#define SCTL_SCRESCTRL1_PER_HSI2C3	(1 << 21)
+#define SCTL_SCRESCTRL1_PER_MSP0	(1 << 22)
+#define SCTL_SCRESCTRL1_PER_MSP1	(1 << 23)
+#define SCTL_SCRESCTRL1_PER_MSP2	(1 << 24)
+#define SCTL_SCRESCTRL1_PER_MSP3	(1 << 25)
+#define SCTL_SCRESCTRL1_PER_MSP4	(1 << 26)
+#define SCTL_SCRESCTRL1_PER_MSP5	(1 << 27)
+#define SCTL_SCRESCTRL1_PER_MMC	(1 << 28)
+#define SCTL_SCRESCTRL1_KER_MSP0	(1 << 29)
+#define SCTL_SCRESCTRL1_KER_MSP1	(1 << 30)
+#define SCTL_SCRESCTRL1_KER_MSP2	(1 << 31)
+
+#define SCTL_SCPEREN0_UART0		(1 << 0)
+#define SCTL_SCPEREN0_UART1		(1 << 1)
+#define SCTL_SCPEREN0_UART2		(1 << 2)
+#define SCTL_SCPEREN0_UART3		(1 << 3)
+#define SCTL_SCPEREN0_MSP0		(1 << 4)
+#define SCTL_SCPEREN0_MSP1		(1 << 5)
+#define SCTL_SCPEREN0_MSP2		(1 << 6)
+#define SCTL_SCPEREN0_MSP3		(1 << 7)
+#define SCTL_SCPEREN0_MSP4		(1 << 8)
+#define SCTL_SCPEREN0_MSP5		(1 << 9)
+#define SCTL_SCPEREN0_SPI0		(1 << 10)
+#define SCTL_SCPEREN0_SPI1		(1 << 11)
+#define SCTL_SCPEREN0_SPI2		(1 << 12)
+#define SCTL_SCPEREN0_I2C0		(1 << 13)
+#define SCTL_SCPEREN0_I2C1		(1 << 14)
+#define SCTL_SCPEREN0_I2C2		(1 << 15)
+#define SCTL_SCPEREN0_I2C3		(1 << 16)
+#define SCTL_SCPEREN0_SVDO_LVDS		(1 << 17)
+#define SCTL_SCPEREN0_USB_HOST		(1 << 18)
+#define SCTL_SCPEREN0_USB_OTG		(1 << 19)
+#define SCTL_SCPEREN0_MCI0		(1 << 20)
+#define SCTL_SCPEREN0_MCI1		(1 << 21)
+#define SCTL_SCPEREN0_MCI2		(1 << 22)
+#define SCTL_SCPEREN0_MCI3		(1 << 23)
+#define SCTL_SCPEREN0_SATA		(1 << 24)
+#define SCTL_SCPEREN0_ETHERNET		(1 << 25)
+#define SCTL_SCPEREN0_VIC		(1 << 26)
+#define SCTL_SCPEREN0_DMA_AUDIO		(1 << 27)
+#define SCTL_SCPEREN0_DMA_SOC		(1 << 28)
+#define SCTL_SCPEREN0_RAM		(1 << 29)
+#define SCTL_SCPEREN0_VIP		(1 << 30)
+#define SCTL_SCPEREN0_ARM		(1 << 31)
+
+#define SCTL_SCPEREN1_UART0		(1 << 0)
+#define SCTL_SCPEREN1_UART1		(1 << 1)
+#define SCTL_SCPEREN1_UART2		(1 << 2)
+#define SCTL_SCPEREN1_UART3		(1 << 3)
+#define SCTL_SCPEREN1_MSP0		(1 << 4)
+#define SCTL_SCPEREN1_MSP1		(1 << 5)
+#define SCTL_SCPEREN1_MSP2		(1 << 6)
+#define SCTL_SCPEREN1_MSP3		(1 << 7)
+#define SCTL_SCPEREN1_MSP4		(1 << 8)
+#define SCTL_SCPEREN1_MSP5		(1 << 9)
+#define SCTL_SCPEREN1_SPI0		(1 << 10)
+#define SCTL_SCPEREN1_SPI1		(1 << 11)
+#define SCTL_SCPEREN1_SPI2		(1 << 12)
+#define SCTL_SCPEREN1_I2C0		(1 << 13)
+#define SCTL_SCPEREN1_I2C1		(1 << 14)
+#define SCTL_SCPEREN1_I2C2		(1 << 15)
+#define SCTL_SCPEREN1_I2C3		(1 << 16)
+#define SCTL_SCPEREN1_USB_PHY		(1 << 17)
+
+#endif /* __STA2X11_MFD_H */
-- 
cgit v0.10.2


From 7b0d44f3b7cec0ae6f5e81d18df4a4077bbabb7c Mon Sep 17 00:00:00 2001
From: Alessandro Rubini <rubini@gnudd.com>
Date: Thu, 12 Apr 2012 10:48:55 +0200
Subject: gpio: Add STA2X11 GPIO block

This introduces 128 gpio bits (for each PCI device installed) with
working interrupt support.

Signed-off-by: Alessandro Rubini <rubini@gnudd.com>
Acked-by: Giancarlo Asnaghi <giancarlo.asnaghi@st.com>
Cc: Alan Cox <alan@linux.intel.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index a0d5796..09ac540 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -149,6 +149,14 @@ config GPIO_PXA
 	help
 	  Say yes here to support the PXA GPIO device
 
+config GPIO_STA2X11
+	bool "STA2x11/ConneXt GPIO support"
+	depends on MFD_STA2X11
+	select GENERIC_IRQ_CHIP
+	help
+	  Say yes here to support the STA2x11/ConneXt GPIO device.
+	  The GPIO module has 128 GPIO pins with alternate functions.
+
 config GPIO_XILINX
 	bool "Xilinx GPIO support"
 	depends on PPC_OF || MICROBLAZE
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index 183c42b..2e69c58 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_PLAT_SAMSUNG)	+= gpio-samsung.o
 obj-$(CONFIG_ARCH_SA1100)	+= gpio-sa1100.o
 obj-$(CONFIG_GPIO_SCH)		+= gpio-sch.o
 obj-$(CONFIG_GPIO_SODAVILLE)	+= gpio-sodaville.o
+obj-$(CONFIG_GPIO_STA2X11)	+= gpio-sta2x11.o
 obj-$(CONFIG_GPIO_STMPE)	+= gpio-stmpe.o
 obj-$(CONFIG_GPIO_SX150X)	+= gpio-sx150x.o
 obj-$(CONFIG_GPIO_TC3589X)	+= gpio-tc3589x.o
diff --git a/drivers/gpio/gpio-sta2x11.c b/drivers/gpio/gpio-sta2x11.c
new file mode 100644
index 0000000..38416be
--- /dev/null
+++ b/drivers/gpio/gpio-sta2x11.c
@@ -0,0 +1,435 @@
+/*
+ * STMicroelectronics ConneXt (STA2X11) GPIO driver
+ *
+ * Copyright 2012 ST Microelectronics (Alessandro Rubini)
+ * Based on gpio-ml-ioh.c, Copyright 2010 OKI Semiconductors Ltd.
+ * Also based on previous sta2x11 work, Copyright 2011 Wind River Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/gpio.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/sta2x11-mfd.h>
+
+struct gsta_regs {
+	u32 dat;		/* 0x00 */
+	u32 dats;
+	u32 datc;
+	u32 pdis;
+	u32 dir;		/* 0x10 */
+	u32 dirs;
+	u32 dirc;
+	u32 unused_1c;
+	u32 afsela;		/* 0x20 */
+	u32 unused_24[7];
+	u32 rimsc;		/* 0x40 */
+	u32 fimsc;
+	u32 is;
+	u32 ic;
+};
+
+struct gsta_gpio {
+	spinlock_t			lock;
+	struct device			*dev;
+	void __iomem			*reg_base;
+	struct gsta_regs __iomem	*regs[GSTA_NR_BLOCKS];
+	struct gpio_chip		gpio;
+	int				irq_base;
+	/* FIXME: save the whole config here (AF, ...) */
+	unsigned			irq_type[GSTA_NR_GPIO];
+};
+
+static inline struct gsta_regs __iomem *__regs(struct gsta_gpio *chip, int nr)
+{
+	return chip->regs[nr / GSTA_GPIO_PER_BLOCK];
+}
+
+static inline u32 __bit(int nr)
+{
+	return 1U << (nr % GSTA_GPIO_PER_BLOCK);
+}
+
+/*
+ * gpio methods
+ */
+
+static void gsta_gpio_set(struct gpio_chip *gpio, unsigned nr, int val)
+{
+	struct gsta_gpio *chip = container_of(gpio, struct gsta_gpio, gpio);
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	u32 bit = __bit(nr);
+
+	if (val)
+		writel(bit, &regs->dats);
+	else
+		writel(bit, &regs->datc);
+}
+
+static int gsta_gpio_get(struct gpio_chip *gpio, unsigned nr)
+{
+	struct gsta_gpio *chip = container_of(gpio, struct gsta_gpio, gpio);
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	u32 bit = __bit(nr);
+
+	return readl(&regs->dat) & bit;
+}
+
+static int gsta_gpio_direction_output(struct gpio_chip *gpio, unsigned nr,
+				      int val)
+{
+	struct gsta_gpio *chip = container_of(gpio, struct gsta_gpio, gpio);
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	u32 bit = __bit(nr);
+
+	writel(bit, &regs->dirs);
+	/* Data register after direction, otherwise pullup/down is selected */
+	if (val)
+		writel(bit, &regs->dats);
+	else
+		writel(bit, &regs->datc);
+	return 0;
+}
+
+static int gsta_gpio_direction_input(struct gpio_chip *gpio, unsigned nr)
+{
+	struct gsta_gpio *chip = container_of(gpio, struct gsta_gpio, gpio);
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	u32 bit = __bit(nr);
+
+	writel(bit, &regs->dirc);
+	return 0;
+}
+
+static int gsta_gpio_to_irq(struct gpio_chip *gpio, unsigned offset)
+{
+	struct gsta_gpio *chip = container_of(gpio, struct gsta_gpio, gpio);
+	return chip->irq_base + offset;
+}
+
+static void gsta_gpio_setup(struct gsta_gpio *chip) /* called from probe */
+{
+	struct gpio_chip *gpio = &chip->gpio;
+
+	/*
+	 * ARCH_NR_GPIOS is currently 256 and dynamic allocation starts
+	 * from the end. However, for compatibility, we need the first
+	 * ConneXt device to start from gpio 0: it's the main chipset
+	 * on most boards so documents and drivers assume gpio0..gpio127
+	 */
+	static int gpio_base;
+
+	gpio->label = dev_name(chip->dev);
+	gpio->owner = THIS_MODULE;
+	gpio->direction_input = gsta_gpio_direction_input;
+	gpio->get = gsta_gpio_get;
+	gpio->direction_output = gsta_gpio_direction_output;
+	gpio->set = gsta_gpio_set;
+	gpio->dbg_show = NULL;
+	gpio->base = gpio_base;
+	gpio->ngpio = GSTA_NR_GPIO;
+	gpio->can_sleep = 0;
+	gpio->to_irq = gsta_gpio_to_irq;
+
+	/*
+	 * After the first device, turn to dynamic gpio numbers.
+	 * For example, with ARCH_NR_GPIOS = 256 we can fit two cards
+	 */
+	if (!gpio_base)
+		gpio_base = -1;
+}
+
+/*
+ * Special method: alternate functions and pullup/pulldown. This is only
+ * invoked on startup to configure gpio's according to platform data.
+ * FIXME : this functionality shall be managed (and exported to other drivers)
+ * via the pin control subsystem.
+ */
+static void gsta_set_config(struct gsta_gpio *chip, int nr, unsigned cfg)
+{
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	unsigned long flags;
+	u32 bit = __bit(nr);
+	u32 val;
+	int err = 0;
+
+	pr_info("%s: %p %i %i\n", __func__, chip, nr, cfg);
+
+	if (cfg == PINMUX_TYPE_NONE)
+		return;
+
+	/* Alternate function or not? */
+	spin_lock_irqsave(&chip->lock, flags);
+	val = readl(&regs->afsela);
+	if (cfg == PINMUX_TYPE_FUNCTION)
+		val |= bit;
+	else
+		val &= ~bit;
+	writel(val | bit, &regs->afsela);
+	if (cfg == PINMUX_TYPE_FUNCTION) {
+		spin_unlock_irqrestore(&chip->lock, flags);
+		return;
+	}
+
+	/* not alternate function: set details */
+	switch (cfg) {
+	case PINMUX_TYPE_OUTPUT_LOW:
+		writel(bit, &regs->dirs);
+		writel(bit, &regs->datc);
+		break;
+	case PINMUX_TYPE_OUTPUT_HIGH:
+		writel(bit, &regs->dirs);
+		writel(bit, &regs->dats);
+		break;
+	case PINMUX_TYPE_INPUT:
+		writel(bit, &regs->dirc);
+		val = readl(&regs->pdis) | bit;
+		writel(val, &regs->pdis);
+		break;
+	case PINMUX_TYPE_INPUT_PULLUP:
+		writel(bit, &regs->dirc);
+		val = readl(&regs->pdis) & ~bit;
+		writel(val, &regs->pdis);
+		writel(bit, &regs->dats);
+		break;
+	case PINMUX_TYPE_INPUT_PULLDOWN:
+		writel(bit, &regs->dirc);
+		val = readl(&regs->pdis) & ~bit;
+		writel(val, &regs->pdis);
+		writel(bit, &regs->datc);
+		break;
+	default:
+		err = 1;
+	}
+	spin_unlock_irqrestore(&chip->lock, flags);
+	if (err)
+		pr_err("%s: chip %p, pin %i, cfg %i is invalid\n",
+		       __func__, chip, nr, cfg);
+}
+
+/*
+ * Irq methods
+ */
+
+static void gsta_irq_disable(struct irq_data *data)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
+	struct gsta_gpio *chip = gc->private;
+	int nr = data->irq - chip->irq_base;
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	u32 bit = __bit(nr);
+	u32 val;
+	unsigned long flags;
+
+	spin_lock_irqsave(&chip->lock, flags);
+	if (chip->irq_type[nr] & IRQ_TYPE_EDGE_RISING) {
+		val = readl(&regs->rimsc) & ~bit;
+		writel(val, &regs->rimsc);
+	}
+	if (chip->irq_type[nr] & IRQ_TYPE_EDGE_FALLING) {
+		val = readl(&regs->fimsc) & ~bit;
+		writel(val, &regs->fimsc);
+	}
+	spin_unlock_irqrestore(&chip->lock, flags);
+	return;
+}
+
+static void gsta_irq_enable(struct irq_data *data)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
+	struct gsta_gpio *chip = gc->private;
+	int nr = data->irq - chip->irq_base;
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	u32 bit = __bit(nr);
+	u32 val;
+	int type;
+	unsigned long flags;
+
+	type = chip->irq_type[nr];
+
+	spin_lock_irqsave(&chip->lock, flags);
+	val = readl(&regs->rimsc);
+	if (type & IRQ_TYPE_EDGE_RISING)
+		writel(val | bit, &regs->rimsc);
+	else
+		writel(val & ~bit, &regs->rimsc);
+	val = readl(&regs->rimsc);
+	if (type & IRQ_TYPE_EDGE_FALLING)
+		writel(val | bit, &regs->fimsc);
+	else
+		writel(val & ~bit, &regs->fimsc);
+	spin_unlock_irqrestore(&chip->lock, flags);
+	return;
+}
+
+static int gsta_irq_type(struct irq_data *d, unsigned int type)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+	struct gsta_gpio *chip = gc->private;
+	int nr = d->irq - chip->irq_base;
+
+	/* We only support edge interrupts */
+	if (!(type & (IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING))) {
+		pr_debug("%s: unsupported type 0x%x\n", __func__, type);
+		return -EINVAL;
+	}
+
+	chip->irq_type[nr] = type; /* used for enable/disable */
+
+	gsta_irq_enable(d);
+	return 0;
+}
+
+static irqreturn_t gsta_gpio_handler(int irq, void *dev_id)
+{
+	struct gsta_gpio *chip = dev_id;
+	struct gsta_regs __iomem *regs;
+	u32 is;
+	int i, nr, base;
+	irqreturn_t ret = IRQ_NONE;
+
+	for (i = 0; i < GSTA_NR_BLOCKS; i++) {
+		regs = chip->regs[i];
+		base = chip->irq_base + i * GSTA_GPIO_PER_BLOCK;
+		while ((is = readl(&regs->is))) {
+			nr = __ffs(is);
+			irq = base + nr;
+			generic_handle_irq(irq);
+			writel(1 << nr, &regs->ic);
+			ret = IRQ_HANDLED;
+		}
+	}
+	return ret;
+}
+
+static __devinit void gsta_alloc_irq_chip(struct gsta_gpio *chip)
+{
+	struct irq_chip_generic *gc;
+	struct irq_chip_type *ct;
+
+	gc = irq_alloc_generic_chip(KBUILD_MODNAME, 1, chip->irq_base,
+				     chip->reg_base, handle_simple_irq);
+	gc->private = chip;
+	ct = gc->chip_types;
+
+	ct->chip.irq_set_type = gsta_irq_type;
+	ct->chip.irq_disable = gsta_irq_disable;
+	ct->chip.irq_enable = gsta_irq_enable;
+
+	/* FIXME: this makes at most 32 interrupts. Request 0 by now */
+	irq_setup_generic_chip(gc, 0 /* IRQ_MSK(GSTA_GPIO_PER_BLOCK) */, 0,
+			       IRQ_NOREQUEST | IRQ_NOPROBE, 0);
+
+	/* Set up all all 128 interrupts: code from setup_generic_chip */
+	{
+		struct irq_chip_type *ct = gc->chip_types;
+		int i, j;
+		for (j = 0; j < GSTA_NR_GPIO; j++) {
+			i = chip->irq_base + j;
+			irq_set_chip_and_handler(i, &ct->chip, ct->handler);
+			irq_set_chip_data(i, gc);
+			irq_modify_status(i, IRQ_NOREQUEST | IRQ_NOPROBE, 0);
+		}
+		gc->irq_cnt = i - gc->irq_base;
+	}
+}
+
+/* The platform device used here is instantiated by the MFD device */
+static int __devinit gsta_probe(struct platform_device *dev)
+{
+	int i, err;
+	struct pci_dev *pdev;
+	struct sta2x11_gpio_pdata *gpio_pdata;
+	struct gsta_gpio *chip;
+	struct resource *res;
+
+	pdev = *(struct pci_dev **)(dev->dev.platform_data);
+	gpio_pdata = dev_get_platdata(&pdev->dev);
+
+	if (gpio_pdata == NULL)
+		dev_err(&dev->dev, "no gpio config\n");
+	pr_debug("gpio config: %p\n", gpio_pdata);
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+
+	chip = devm_kzalloc(&dev->dev, sizeof(*chip), GFP_KERNEL);
+	chip->dev = &dev->dev;
+	chip->reg_base = devm_request_and_ioremap(&dev->dev, res);
+
+	for (i = 0; i < GSTA_NR_BLOCKS; i++) {
+		chip->regs[i] = chip->reg_base + i * 4096;
+		/* disable all irqs */
+		writel(0, &chip->regs[i]->rimsc);
+		writel(0, &chip->regs[i]->fimsc);
+		writel(~0, &chip->regs[i]->ic);
+	}
+	spin_lock_init(&chip->lock);
+	gsta_gpio_setup(chip);
+	for (i = 0; i < GSTA_NR_GPIO; i++)
+		gsta_set_config(chip, i, gpio_pdata->pinconfig[i]);
+
+	/* 384 was used in previous code: be compatible for other drivers */
+	err = irq_alloc_descs(-1, 384, GSTA_NR_GPIO, NUMA_NO_NODE);
+	if (err < 0) {
+		dev_warn(&dev->dev, "sta2x11 gpio: Can't get irq base (%i)\n",
+			 -err);
+		return err;
+	}
+	chip->irq_base = err;
+	gsta_alloc_irq_chip(chip);
+
+	err = request_irq(pdev->irq, gsta_gpio_handler,
+			     IRQF_SHARED, KBUILD_MODNAME, chip);
+	if (err < 0) {
+		dev_err(&dev->dev, "sta2x11 gpio: Can't request irq (%i)\n",
+			-err);
+		goto err_free_descs;
+	}
+
+	err = gpiochip_add(&chip->gpio);
+	if (err < 0) {
+		dev_err(&dev->dev, "sta2x11 gpio: Can't register (%i)\n",
+			-err);
+		goto err_free_irq;
+	}
+
+	platform_set_drvdata(dev, chip);
+	return 0;
+
+err_free_irq:
+	free_irq(pdev->irq, chip);
+err_free_descs:
+	irq_free_descs(chip->irq_base, GSTA_NR_GPIO);
+	return err;
+}
+
+static struct platform_driver sta2x11_gpio_platform_driver = {
+	.driver = {
+		.name	= "sta2x11-gpio",
+		.owner	= THIS_MODULE,
+	},
+	.probe = gsta_probe,
+};
+
+module_platform_driver(sta2x11_gpio_platform_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("sta2x11_gpio GPIO driver");
-- 
cgit v0.10.2


From 1fc9b1eade80b323f02a9cf7a29e1641eddf1052 Mon Sep 17 00:00:00 2001
From: Seth Heasley <seth.heasley@intel.com>
Date: Mon, 23 Apr 2012 09:23:56 -0700
Subject: pci_ids: Add Intel Centerton Legacy Block DeviceID

This patch adds the Integrated Legacy Block DeviceID for the Centerton CPU.  It will be used in the GPIO and Multifunction Devices driver.

Signed-off-by: Seth Heasley <seth.heasley@intel.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 3329965..ab741b0 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2506,6 +2506,7 @@
 #define PCI_DEVICE_ID_INTEL_MRST_SD2	0x084F
 #define PCI_DEVICE_ID_INTEL_I960	0x0960
 #define PCI_DEVICE_ID_INTEL_I960RM	0x0962
+#define PCI_DEVICE_ID_INTEL_CENTERTON_ILB	0x0c60
 #define PCI_DEVICE_ID_INTEL_8257X_SOL	0x1062
 #define PCI_DEVICE_ID_INTEL_82573E_SOL	0x1085
 #define PCI_DEVICE_ID_INTEL_82573L_SOL	0x108F
-- 
cgit v0.10.2


From 8ee3c2a79fe1df10bccd110d5b8cc13c5b9da709 Mon Sep 17 00:00:00 2001
From: Seth Heasley <seth.heasley@intel.com>
Date: Tue, 17 Apr 2012 14:09:22 -0700
Subject: lpc_sch: Add Intel Centerton Multifunction Device support

This patch adds the Intel Centerton processor DeviceID for the
Integrated Legacy Block (ILB).
The ILB provides GPIO, SMBus, and Watchdog functionality.

Signed-off-by: Seth Heasley <seth.heasley@intel.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/lpc_sch.c b/drivers/mfd/lpc_sch.c
index 27026eb..9bf0839 100644
--- a/drivers/mfd/lpc_sch.c
+++ b/drivers/mfd/lpc_sch.c
@@ -36,6 +36,7 @@
 
 #define GPIOBASE	0x44
 #define GPIO_IO_SIZE	64
+#define GPIO_IO_SIZE_CENTERTON	128
 
 #define WDTBASE		0x84
 #define WDT_IO_SIZE	64
@@ -77,6 +78,7 @@ static struct mfd_cell tunnelcreek_cells[] = {
 static DEFINE_PCI_DEVICE_TABLE(lpc_sch_ids) = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SCH_LPC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ITC_LPC) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CENTERTON_ILB) },
 	{ 0, }
 };
 MODULE_DEVICE_TABLE(pci, lpc_sch_ids);
@@ -115,7 +117,11 @@ static int __devinit lpc_sch_probe(struct pci_dev *dev,
 	}
 
 	gpio_sch_resource.start = base_addr;
-	gpio_sch_resource.end = base_addr + GPIO_IO_SIZE - 1;
+
+	if (id->device == PCI_DEVICE_ID_INTEL_CENTERTON_ILB)
+		gpio_sch_resource.end = base_addr + GPIO_IO_SIZE_CENTERTON - 1;
+	else
+		gpio_sch_resource.end = base_addr + GPIO_IO_SIZE - 1;
 
 	for (i=0; i < ARRAY_SIZE(lpc_sch_cells); i++)
 		lpc_sch_cells[i].id = id->device;
@@ -125,7 +131,8 @@ static int __devinit lpc_sch_probe(struct pci_dev *dev,
 	if (ret)
 		goto out_dev;
 
-	if (id->device == PCI_DEVICE_ID_INTEL_ITC_LPC) {
+	if (id->device == PCI_DEVICE_ID_INTEL_ITC_LPC
+	 || id->device == PCI_DEVICE_ID_INTEL_CENTERTON_ILB) {
 		pci_read_config_dword(dev, WDTBASE, &base_addr_cfg);
 		if (!(base_addr_cfg & (1 << 31))) {
 			dev_err(&dev->dev, "Decode of the WDT I/O range disabled\n");
-- 
cgit v0.10.2


From 9e69ab4116668be1e327d164d6a27c8605b0aabb Mon Sep 17 00:00:00 2001
From: Olof Johansson <olof@lixom.net>
Date: Sat, 5 May 2012 16:02:48 -0700
Subject: mfd: Fix of_match_node() da9052 arguments

The driver calls of_match_node() with the arguments swapped.

Signed-off-by: Olof Johansson <olof@lixom.net>
Tested-by: Ying-Chun Liu <paul.liu@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/da9052-i2c.c b/drivers/mfd/da9052-i2c.c
index d8abdb3..8410065f1 100644
--- a/drivers/mfd/da9052-i2c.c
+++ b/drivers/mfd/da9052-i2c.c
@@ -104,7 +104,7 @@ static int __devinit da9052_i2c_probe(struct i2c_client *client,
 		struct device_node *np = client->dev.of_node;
 		const struct of_device_id *deviceid;
 
-		deviceid = of_match_node(np, dialog_dt_ids);
+		deviceid = of_match_node(dialog_dt_ids, np);
 		id = (const struct i2c_device_id *)deviceid->data;
 	}
 #endif
-- 
cgit v0.10.2


From 9fc63f670f53cf9dcdca5e523289dda35da47e63 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 19 Apr 2012 21:36:41 +0100
Subject: mfd: Register db8500-prcmu as a platform driver instead of only
 probing

Pass the probe function as part of the platform_driver struct and
register using the more common platform_driver_register call. In
subsequent patches we'll also add DT support into the struct.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c
index 5be3248..79a79ae 100644
--- a/drivers/mfd/db8500-prcmu.c
+++ b/drivers/mfd/db8500-prcmu.c
@@ -2958,7 +2958,7 @@ static struct mfd_cell db8500_prcmu_devs[] = {
  * prcmu_fw_init - arch init call for the Linux PRCMU fw init logic
  *
  */
-static int __init db8500_prcmu_probe(struct platform_device *pdev)
+static int __devinit db8500_prcmu_probe(struct platform_device *pdev)
 {
 	int err = 0;
 
@@ -2999,11 +2999,12 @@ static struct platform_driver db8500_prcmu_driver = {
 		.name = "db8500-prcmu",
 		.owner = THIS_MODULE,
 	},
+	.probe = db8500_prcmu_probe,
 };
 
 static int __init db8500_prcmu_init(void)
 {
-	return platform_driver_probe(&db8500_prcmu_driver, db8500_prcmu_probe);
+	return platform_driver_register(&db8500_prcmu_driver);
 }
 
 arch_initcall(db8500_prcmu_init);
-- 
cgit v0.10.2


From ca7edd16ae488fe0eff5d4f8eb17c5caa8dcc5fa Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Wed, 9 May 2012 17:19:25 +0200
Subject: mfd: Enable Device Tree support for the db8500-prcmu

This patch will enable probing to occur during a Device Tree enabled
boot. The IRQ base is expected to be located in and will be fetched
from the DT itself. We also prevent any of the db8500 regulators
from being registered here, as they will be enabled via DT instead.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c
index 79a79ae..e6f8d26 100644
--- a/drivers/mfd/db8500-prcmu.c
+++ b/drivers/mfd/db8500-prcmu.c
@@ -2960,7 +2960,8 @@ static struct mfd_cell db8500_prcmu_devs[] = {
  */
 static int __devinit db8500_prcmu_probe(struct platform_device *pdev)
 {
-	int err = 0;
+	struct device_node *np = pdev->dev.of_node;
+	int irq = 0, err = 0;
 
 	if (ux500_is_svp())
 		return -ENODEV;
@@ -2970,8 +2971,14 @@ static int __devinit db8500_prcmu_probe(struct platform_device *pdev)
 	/* Clean up the mailbox interrupts after pre-kernel code. */
 	writel(ALL_MBOX_BITS, PRCM_ARM_IT1_CLR);
 
-	err = request_threaded_irq(IRQ_DB8500_PRCMU1, prcmu_irq_handler,
-		prcmu_irq_thread_fn, IRQF_NO_SUSPEND, "prcmu", NULL);
+	if (np)
+		irq = platform_get_irq(pdev, 0);
+
+	if (!np || irq <= 0)
+		irq = IRQ_DB8500_PRCMU1;
+
+	err = request_threaded_irq(irq, prcmu_irq_handler,
+	        prcmu_irq_thread_fn, IRQF_NO_SUSPEND, "prcmu", NULL);
 	if (err < 0) {
 		pr_err("prcmu: Failed to allocate IRQ_DB8500_PRCMU1.\n");
 		err = -EBUSY;
@@ -2981,14 +2988,16 @@ static int __devinit db8500_prcmu_probe(struct platform_device *pdev)
 	if (cpu_is_u8500v20_or_later())
 		prcmu_config_esram0_deep_sleep(ESRAM0_DEEP_SLEEP_STATE_RET);
 
-	err = mfd_add_devices(&pdev->dev, 0, db8500_prcmu_devs,
-			      ARRAY_SIZE(db8500_prcmu_devs), NULL,
-			      0);
+	if (!np) {
+		err = mfd_add_devices(&pdev->dev, 0, db8500_prcmu_devs,
+				ARRAY_SIZE(db8500_prcmu_devs), NULL, 0);
+		if (err) {
+			pr_err("prcmu: Failed to add subdevices\n");
+			return err;
+		}
+	}
 
-	if (err)
-		pr_err("prcmu: Failed to add subdevices\n");
-	else
-		pr_info("DB8500 PRCMU initialized\n");
+	pr_info("DB8500 PRCMU initialized\n");
 
 no_irq_return:
 	return err;
-- 
cgit v0.10.2


From ae8406357eca7fde4ff047e858d285faee836804 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Fri, 4 May 2012 19:23:20 +0100
Subject: mfd: Add support for db8500-prcmu regulator supply for nmk-i2c.4

This applies a supply alias for the db8500's fifth Nomadik i2c port.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c
index e6f8d26..671c8bc 100644
--- a/drivers/mfd/db8500-prcmu.c
+++ b/drivers/mfd/db8500-prcmu.c
@@ -2720,6 +2720,7 @@ static struct regulator_consumer_supply db8500_vape_consumers[] = {
 	REGULATOR_SUPPLY("v-i2c", "nmk-i2c.1"),
 	REGULATOR_SUPPLY("v-i2c", "nmk-i2c.2"),
 	REGULATOR_SUPPLY("v-i2c", "nmk-i2c.3"),
+	REGULATOR_SUPPLY("v-i2c", "nmk-i2c.4"),
 	/* "v-mmc" changed to "vcore" in the mainline kernel */
 	REGULATOR_SUPPLY("vcore", "sdi0"),
 	REGULATOR_SUPPLY("vcore", "sdi1"),
-- 
cgit v0.10.2


From 16c5c023aac86228e3e94c4bf6d19708ea861a05 Mon Sep 17 00:00:00 2001
From: Johan Hovold <jhovold@gmail.com>
Date: Thu, 3 May 2012 12:26:36 +0200
Subject: mfd: Add LM3533 lighting-power core driver

Add support for National Semiconductor / TI LM3533 lighting power chips.

This is the core driver which provides register access over I2C and
registers the ambient-light-sensor, LED and backlight sub-drivers.

Signed-off-by: Johan Hovold <jhovold@gmail.com>
Reviewed-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533 b/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533
new file mode 100644
index 0000000..5700721
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533
@@ -0,0 +1,38 @@
+What:		/sys/bus/i2c/devices/.../boost_freq
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <jhovold@gmail.com>
+Description:
+		Set the boost converter switching frequency (0, 1), where
+
+		0 -  500Hz
+		1 - 1000Hz
+
+What:		/sys/bus/i2c/devices/.../boost_ovp
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <jhovold@gmail.com>
+Description:
+		Set the boost converter over-voltage protection threshold
+		(0..3), where
+
+		0 - 16V
+		1 - 24V
+		2 - 32V
+		3 - 40V
+
+What:		/sys/bus/i2c/devices/.../output_hvled[n]
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <jhovold@gmail.com>
+Description:
+		Set the controlling backlight device for high-voltage current
+		sink HVLED[n] (n = 1, 2) (0, 1).
+
+What:		/sys/bus/i2c/devices/.../output_lvled[n]
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <jhovold@gmail.com>
+Description:
+		Set the controlling led device for low-voltage current sink
+		LVLED[n] (n = 1..5) (0..3).
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 48eed22..211f5de 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -106,6 +106,19 @@ config UCB1400_CORE
 	  To compile this driver as a module, choose M here: the
 	  module will be called ucb1400_core.
 
+config MFD_LM3533
+	tristate "LM3533 Lighting Power chip"
+	depends on I2C
+	select MFD_CORE
+	select REGMAP_I2C
+	help
+	  Say yes here to enable support for National Semiconductor / TI
+	  LM3533 Lighting Power chips.
+
+	  This driver provides common support for accessing the device;
+	  additional drivers must be enabled in order to use the LED,
+	  backlight or ambient-light-sensor functionality of the device.
+
 config TPS6105X
 	tristate "TPS61050/61052 Boost Converters"
 	depends on I2C
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 0dc55cb..d3dae95 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -120,3 +120,4 @@ obj-$(CONFIG_MFD_INTEL_MSIC)	+= intel_msic.o
 obj-$(CONFIG_MFD_RC5T583)	+= rc5t583.o rc5t583-irq.o
 obj-$(CONFIG_MFD_S5M_CORE)	+= s5m-core.o s5m-irq.o
 obj-$(CONFIG_MFD_ANATOP)	+= anatop-mfd.o
+obj-$(CONFIG_MFD_LM3533)	+= lm3533-core.o lm3533-ctrlbank.o
diff --git a/drivers/mfd/lm3533-core.c b/drivers/mfd/lm3533-core.c
new file mode 100644
index 0000000..75f4b7f
--- /dev/null
+++ b/drivers/mfd/lm3533-core.c
@@ -0,0 +1,717 @@
+/*
+ * lm3533-core.c -- LM3533 Core
+ *
+ * Copyright (C) 2011-2012 Texas Instruments
+ *
+ * Author: Johan Hovold <jhovold@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under  the terms of the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/gpio.h>
+#include <linux/i2c.h>
+#include <linux/mfd/core.h>
+#include <linux/regmap.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <linux/mfd/lm3533.h>
+
+
+#define LM3533_BOOST_OVP_MAX		0x03
+#define LM3533_BOOST_OVP_MASK		0x06
+#define LM3533_BOOST_OVP_SHIFT		1
+
+#define LM3533_BOOST_FREQ_MAX		0x01
+#define LM3533_BOOST_FREQ_MASK		0x01
+#define LM3533_BOOST_FREQ_SHIFT		0
+
+#define LM3533_BL_ID_MASK		1
+#define LM3533_LED_ID_MASK		3
+#define LM3533_BL_ID_MAX		1
+#define LM3533_LED_ID_MAX		3
+
+#define LM3533_HVLED_ID_MAX		2
+#define LM3533_LVLED_ID_MAX		5
+
+#define LM3533_REG_OUTPUT_CONF1		0x10
+#define LM3533_REG_OUTPUT_CONF2		0x11
+#define LM3533_REG_BOOST_PWM		0x2c
+
+#define LM3533_REG_MAX			0xb2
+
+
+static struct mfd_cell lm3533_als_devs[] = {
+	{
+		.name	= "lm3533-als",
+		.id	= -1,
+	},
+};
+
+static struct mfd_cell lm3533_bl_devs[] = {
+	{
+		.name	= "lm3533-backlight",
+		.id	= 0,
+	},
+	{
+		.name	= "lm3533-backlight",
+		.id	= 1,
+	},
+};
+
+static struct mfd_cell lm3533_led_devs[] = {
+	{
+		.name	= "lm3533-leds",
+		.id	= 0,
+	},
+	{
+		.name	= "lm3533-leds",
+		.id	= 1,
+	},
+	{
+		.name	= "lm3533-leds",
+		.id	= 2,
+	},
+	{
+		.name	= "lm3533-leds",
+		.id	= 3,
+	},
+};
+
+int lm3533_read(struct lm3533 *lm3533, u8 reg, u8 *val)
+{
+	int tmp;
+	int ret;
+
+	ret = regmap_read(lm3533->regmap, reg, &tmp);
+	if (ret < 0) {
+		dev_err(lm3533->dev, "failed to read register %02x: %d\n",
+								reg, ret);
+		return ret;
+	}
+
+	*val = tmp;
+
+	dev_dbg(lm3533->dev, "read [%02x]: %02x\n", reg, *val);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lm3533_read);
+
+int lm3533_write(struct lm3533 *lm3533, u8 reg, u8 val)
+{
+	int ret;
+
+	dev_dbg(lm3533->dev, "write [%02x]: %02x\n", reg, val);
+
+	ret = regmap_write(lm3533->regmap, reg, val);
+	if (ret < 0) {
+		dev_err(lm3533->dev, "failed to write register %02x: %d\n",
+								reg, ret);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lm3533_write);
+
+int lm3533_update(struct lm3533 *lm3533, u8 reg, u8 val, u8 mask)
+{
+	int ret;
+
+	dev_dbg(lm3533->dev, "update [%02x]: %02x/%02x\n", reg, val, mask);
+
+	ret = regmap_update_bits(lm3533->regmap, reg, val, mask);
+	if (ret < 0) {
+		dev_err(lm3533->dev, "failed to update register %02x: %d\n",
+								reg, ret);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lm3533_update);
+
+/*
+ * HVLED output config -- output hvled controlled by backlight bl
+ */
+static int lm3533_set_hvled_config(struct lm3533 *lm3533, u8 hvled, u8 bl)
+{
+	u8 val;
+	u8 mask;
+	int shift;
+	int ret;
+
+	if (hvled == 0 || hvled > LM3533_HVLED_ID_MAX)
+		return -EINVAL;
+
+	if (bl > LM3533_BL_ID_MAX)
+		return -EINVAL;
+
+	shift = hvled - 1;
+	mask = LM3533_BL_ID_MASK << shift;
+	val = bl << shift;
+
+	ret = lm3533_update(lm3533, LM3533_REG_OUTPUT_CONF1, val, mask);
+	if (ret)
+		dev_err(lm3533->dev, "failed to set hvled config\n");
+
+	return ret;
+}
+
+/*
+ * LVLED output config -- output lvled controlled by LED led
+ */
+static int lm3533_set_lvled_config(struct lm3533 *lm3533, u8 lvled, u8 led)
+{
+	u8 reg;
+	u8 val;
+	u8 mask;
+	int shift;
+	int ret;
+
+	if (lvled == 0 || lvled > LM3533_LVLED_ID_MAX)
+		return -EINVAL;
+
+	if (led > LM3533_LED_ID_MAX)
+		return -EINVAL;
+
+	if (lvled < 4) {
+		reg = LM3533_REG_OUTPUT_CONF1;
+		shift = 2 * lvled;
+	} else {
+		reg = LM3533_REG_OUTPUT_CONF2;
+		shift = 2 * (lvled - 4);
+	}
+
+	mask = LM3533_LED_ID_MASK << shift;
+	val = led << shift;
+
+	ret = lm3533_update(lm3533, reg, val, mask);
+	if (ret)
+		dev_err(lm3533->dev, "failed to set lvled config\n");
+
+	return ret;
+}
+
+static void lm3533_enable(struct lm3533 *lm3533)
+{
+	if (gpio_is_valid(lm3533->gpio_hwen))
+		gpio_set_value(lm3533->gpio_hwen, 1);
+}
+
+static void lm3533_disable(struct lm3533 *lm3533)
+{
+	if (gpio_is_valid(lm3533->gpio_hwen))
+		gpio_set_value(lm3533->gpio_hwen, 0);
+}
+
+enum lm3533_attribute_type {
+	LM3533_ATTR_TYPE_BACKLIGHT,
+	LM3533_ATTR_TYPE_LED,
+};
+
+struct lm3533_device_attribute {
+	struct device_attribute dev_attr;
+	enum lm3533_attribute_type type;
+	union {
+		struct {
+			u8 id;
+		} output;
+		struct {
+			u8 reg;
+			u8 shift;
+			u8 mask;
+			u8 max;
+		} generic;
+	} u;
+};
+
+#define to_lm3533_dev_attr(_attr) \
+	container_of(_attr, struct lm3533_device_attribute, dev_attr)
+
+static ssize_t show_lm3533_reg(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct lm3533 *lm3533 = dev_get_drvdata(dev);
+	struct lm3533_device_attribute *lattr = to_lm3533_dev_attr(attr);
+	u8 val;
+	int ret;
+
+	ret = lm3533_read(lm3533, lattr->u.generic.reg, &val);
+	if (ret)
+		return ret;
+
+	val = (val & lattr->u.generic.mask) >> lattr->u.generic.shift;
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t store_lm3533_reg(struct device *dev,
+						struct device_attribute *attr,
+						const char *buf, size_t len)
+{
+	struct lm3533 *lm3533 = dev_get_drvdata(dev);
+	struct lm3533_device_attribute *lattr = to_lm3533_dev_attr(attr);
+	u8 val;
+	int ret;
+
+	if (kstrtou8(buf, 0, &val) || val > lattr->u.generic.max)
+		return -EINVAL;
+
+	val = val << lattr->u.generic.shift;
+	ret = lm3533_update(lm3533, lattr->u.generic.reg, val,
+							lattr->u.generic.mask);
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+#define GENERIC_ATTR(_reg, _max, _mask, _shift) \
+	{ .reg		= _reg, \
+	  .max		= _max, \
+	  .mask		= _mask, \
+	  .shift	= _shift }
+
+#define LM3533_GENERIC_ATTR(_name, _mode, _show, _store, _type,	\
+						_reg, _max, _mask, _shift) \
+	struct lm3533_device_attribute lm3533_dev_attr_##_name = { \
+		.dev_attr	= __ATTR(_name, _mode, _show, _store), \
+		.type		= _type, \
+		.u.generic	= GENERIC_ATTR(_reg, _max, _mask, _shift) }
+
+#define LM3533_GENERIC_ATTR_RW(_name, _type, _reg, _max, _mask, _shift) \
+	LM3533_GENERIC_ATTR(_name, S_IRUGO | S_IWUSR, \
+					show_lm3533_reg, store_lm3533_reg, \
+					_type, _reg, _max, _mask, _shift)
+
+#define LM3533_BOOST_ATTR_RW(_name, _NAME) \
+	LM3533_GENERIC_ATTR_RW(_name, LM3533_ATTR_TYPE_BACKLIGHT, \
+				LM3533_REG_BOOST_PWM, LM3533_##_NAME##_MAX, \
+				LM3533_##_NAME##_MASK, LM3533_##_NAME##_SHIFT)
+/*
+ * Boost Over Voltage Protection Select
+ *
+ *   0 - 16 V (default)
+ *   1 - 24 V
+ *   2 - 32 V
+ *   3 - 40 V
+ */
+static LM3533_BOOST_ATTR_RW(boost_ovp, BOOST_OVP);
+
+/*
+ * Boost Frequency Select
+ *
+ *   0 - 500 kHz (default)
+ *   1 - 1 MHz
+ */
+static LM3533_BOOST_ATTR_RW(boost_freq, BOOST_FREQ);
+
+static ssize_t show_output(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct lm3533 *lm3533 = dev_get_drvdata(dev);
+	struct lm3533_device_attribute *lattr = to_lm3533_dev_attr(attr);
+	int id = lattr->u.output.id;
+	u8 reg;
+	u8 val;
+	u8 mask;
+	int shift;
+	int ret;
+
+	if (lattr->type == LM3533_ATTR_TYPE_BACKLIGHT) {
+		reg = LM3533_REG_OUTPUT_CONF1;
+		shift = id - 1;
+		mask = LM3533_BL_ID_MASK << shift;
+	} else {
+		if (id < 4) {
+			reg = LM3533_REG_OUTPUT_CONF1;
+			shift = 2 * id;
+		} else {
+			reg = LM3533_REG_OUTPUT_CONF2;
+			shift = 2 * (id - 4);
+		}
+		mask = LM3533_LED_ID_MASK << shift;
+	}
+
+	ret = lm3533_read(lm3533, reg, &val);
+	if (ret)
+		return ret;
+
+	val = (val & mask) >> shift;
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t store_output(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	struct lm3533 *lm3533 = dev_get_drvdata(dev);
+	struct lm3533_device_attribute *lattr = to_lm3533_dev_attr(attr);
+	int id = lattr->u.output.id;
+	u8 val;
+	int ret;
+
+	if (kstrtou8(buf, 0, &val))
+		return -EINVAL;
+
+	if (lattr->type == LM3533_ATTR_TYPE_BACKLIGHT)
+		ret = lm3533_set_hvled_config(lm3533, id, val);
+	else
+		ret = lm3533_set_lvled_config(lm3533, id, val);
+
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+#define LM3533_OUTPUT_ATTR(_name, _mode, _show, _store, _type, _id) \
+	struct lm3533_device_attribute lm3533_dev_attr_##_name = \
+		{ .dev_attr	= __ATTR(_name, _mode, _show, _store), \
+		  .type		= _type, \
+		  .u.output	= { .id = _id }, }
+
+#define LM3533_OUTPUT_ATTR_RW(_name, _type, _id) \
+	LM3533_OUTPUT_ATTR(output_##_name, S_IRUGO | S_IWUSR, \
+					show_output, store_output, _type, _id)
+
+#define LM3533_OUTPUT_HVLED_ATTR_RW(_nr) \
+	LM3533_OUTPUT_ATTR_RW(hvled##_nr, LM3533_ATTR_TYPE_BACKLIGHT, _nr)
+#define LM3533_OUTPUT_LVLED_ATTR_RW(_nr) \
+	LM3533_OUTPUT_ATTR_RW(lvled##_nr, LM3533_ATTR_TYPE_LED, _nr)
+/*
+ * Output config:
+ *
+ * output_hvled<nr>	0-1
+ * output_lvled<nr>	0-3
+ */
+static LM3533_OUTPUT_HVLED_ATTR_RW(1);
+static LM3533_OUTPUT_HVLED_ATTR_RW(2);
+static LM3533_OUTPUT_LVLED_ATTR_RW(1);
+static LM3533_OUTPUT_LVLED_ATTR_RW(2);
+static LM3533_OUTPUT_LVLED_ATTR_RW(3);
+static LM3533_OUTPUT_LVLED_ATTR_RW(4);
+static LM3533_OUTPUT_LVLED_ATTR_RW(5);
+
+static struct attribute *lm3533_attributes[] = {
+	&lm3533_dev_attr_boost_freq.dev_attr.attr,
+	&lm3533_dev_attr_boost_ovp.dev_attr.attr,
+	&lm3533_dev_attr_output_hvled1.dev_attr.attr,
+	&lm3533_dev_attr_output_hvled2.dev_attr.attr,
+	&lm3533_dev_attr_output_lvled1.dev_attr.attr,
+	&lm3533_dev_attr_output_lvled2.dev_attr.attr,
+	&lm3533_dev_attr_output_lvled3.dev_attr.attr,
+	&lm3533_dev_attr_output_lvled4.dev_attr.attr,
+	&lm3533_dev_attr_output_lvled5.dev_attr.attr,
+	NULL,
+};
+
+#define to_dev_attr(_attr) \
+	container_of(_attr, struct device_attribute, attr)
+
+static mode_t lm3533_attr_is_visible(struct kobject *kobj,
+					     struct attribute *attr, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct lm3533 *lm3533 = dev_get_drvdata(dev);
+	struct device_attribute *dattr = to_dev_attr(attr);
+	struct lm3533_device_attribute *lattr = to_lm3533_dev_attr(dattr);
+	enum lm3533_attribute_type type = lattr->type;
+	mode_t mode = attr->mode;
+
+	if (!lm3533->have_backlights && type == LM3533_ATTR_TYPE_BACKLIGHT)
+		mode = 0;
+	else if (!lm3533->have_leds && type == LM3533_ATTR_TYPE_LED)
+		mode = 0;
+
+	return mode;
+};
+
+static struct attribute_group lm3533_attribute_group = {
+	.is_visible	= lm3533_attr_is_visible,
+	.attrs		= lm3533_attributes
+};
+
+static int __devinit lm3533_device_als_init(struct lm3533 *lm3533)
+{
+	struct lm3533_platform_data *pdata = lm3533->dev->platform_data;
+	int ret;
+
+	if (!pdata->als)
+		return 0;
+
+	lm3533_als_devs[0].platform_data = pdata->als;
+	lm3533_als_devs[0].pdata_size = sizeof(*pdata->als);
+
+	ret = mfd_add_devices(lm3533->dev, 0, lm3533_als_devs, 1, NULL, 0);
+	if (ret) {
+		dev_err(lm3533->dev, "failed to add ALS device\n");
+		return ret;
+	}
+
+	lm3533->have_als = 1;
+
+	return 0;
+}
+
+static int __devinit lm3533_device_bl_init(struct lm3533 *lm3533)
+{
+	struct lm3533_platform_data *pdata = lm3533->dev->platform_data;
+	int i;
+	int ret;
+
+	if (!pdata->backlights || pdata->num_backlights == 0)
+		return 0;
+
+	if (pdata->num_backlights > ARRAY_SIZE(lm3533_bl_devs))
+		pdata->num_backlights = ARRAY_SIZE(lm3533_bl_devs);
+
+	for (i = 0; i < pdata->num_backlights; ++i) {
+		lm3533_bl_devs[i].platform_data = &pdata->backlights[i];
+		lm3533_bl_devs[i].pdata_size = sizeof(pdata->backlights[i]);
+	}
+
+	ret = mfd_add_devices(lm3533->dev, 0, lm3533_bl_devs,
+					pdata->num_backlights, NULL, 0);
+	if (ret) {
+		dev_err(lm3533->dev, "failed to add backlight devices\n");
+		return ret;
+	}
+
+	lm3533->have_backlights = 1;
+
+	return 0;
+}
+
+static int __devinit lm3533_device_led_init(struct lm3533 *lm3533)
+{
+	struct lm3533_platform_data *pdata = lm3533->dev->platform_data;
+	int i;
+	int ret;
+
+	if (!pdata->leds || pdata->num_leds == 0)
+		return 0;
+
+	if (pdata->num_leds > ARRAY_SIZE(lm3533_led_devs))
+		pdata->num_leds = ARRAY_SIZE(lm3533_led_devs);
+
+	for (i = 0; i < pdata->num_leds; ++i) {
+		lm3533_led_devs[i].platform_data = &pdata->leds[i];
+		lm3533_led_devs[i].pdata_size = sizeof(pdata->leds[i]);
+	}
+
+	ret = mfd_add_devices(lm3533->dev, 0, lm3533_led_devs,
+						pdata->num_leds, NULL, 0);
+	if (ret) {
+		dev_err(lm3533->dev, "failed to add LED devices\n");
+		return ret;
+	}
+
+	lm3533->have_leds = 1;
+
+	return 0;
+}
+
+static int __devinit lm3533_device_init(struct lm3533 *lm3533)
+{
+	struct lm3533_platform_data *pdata = lm3533->dev->platform_data;
+	int ret;
+
+	dev_dbg(lm3533->dev, "%s\n", __func__);
+
+	if (!pdata) {
+		dev_err(lm3533->dev, "no platform data\n");
+		return -EINVAL;
+	}
+
+	lm3533->gpio_hwen = pdata->gpio_hwen;
+
+	dev_set_drvdata(lm3533->dev, lm3533);
+
+	if (gpio_is_valid(lm3533->gpio_hwen)) {
+		ret = gpio_request_one(lm3533->gpio_hwen, GPIOF_OUT_INIT_LOW,
+								"lm3533-hwen");
+		if (ret < 0) {
+			dev_err(lm3533->dev,
+				"failed to request HWEN GPIO %d\n",
+				lm3533->gpio_hwen);
+			return ret;
+		}
+	}
+
+	lm3533_enable(lm3533);
+
+	lm3533_device_als_init(lm3533);
+	lm3533_device_bl_init(lm3533);
+	lm3533_device_led_init(lm3533);
+
+	ret = sysfs_create_group(&lm3533->dev->kobj, &lm3533_attribute_group);
+	if (ret < 0) {
+		dev_err(lm3533->dev, "failed to create sysfs attributes\n");
+		goto err_unregister;
+	}
+
+	return 0;
+
+err_unregister:
+	mfd_remove_devices(lm3533->dev);
+	lm3533_disable(lm3533);
+	if (gpio_is_valid(lm3533->gpio_hwen))
+		gpio_free(lm3533->gpio_hwen);
+
+	return ret;
+}
+
+static void __devexit lm3533_device_exit(struct lm3533 *lm3533)
+{
+	dev_dbg(lm3533->dev, "%s\n", __func__);
+
+	sysfs_remove_group(&lm3533->dev->kobj, &lm3533_attribute_group);
+
+	mfd_remove_devices(lm3533->dev);
+	lm3533_disable(lm3533);
+	if (gpio_is_valid(lm3533->gpio_hwen))
+		gpio_free(lm3533->gpio_hwen);
+}
+
+static bool lm3533_readable_register(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case 0x10 ... 0x2c:
+	case 0x30 ... 0x38:
+	case 0x40 ... 0x45:
+	case 0x50 ... 0x57:
+	case 0x60 ... 0x6e:
+	case 0x70 ... 0x75:
+	case 0x80 ... 0x85:
+	case 0x90 ... 0x95:
+	case 0xa0 ... 0xa5:
+	case 0xb0 ... 0xb2:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool lm3533_volatile_register(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case 0x34:		/* zone */
+	case 0x37 ... 0x38:	/* adc */
+	case 0xb0 ... 0xb1:	/* fault */
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool lm3533_precious_register(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case 0x34:		/* zone */
+		return true;
+	default:
+		return false;
+	}
+}
+
+static struct regmap_config regmap_config = {
+	.reg_bits	= 8,
+	.val_bits	= 8,
+	.max_register	= LM3533_REG_MAX,
+	.readable_reg	= lm3533_readable_register,
+	.volatile_reg	= lm3533_volatile_register,
+	.precious_reg	= lm3533_precious_register,
+};
+
+static int __devinit lm3533_i2c_probe(struct i2c_client *i2c,
+					const struct i2c_device_id *id)
+{
+	struct lm3533 *lm3533;
+	int ret;
+
+	dev_dbg(&i2c->dev, "%s\n", __func__);
+
+	lm3533 = kzalloc(sizeof(*lm3533), GFP_KERNEL);
+	if (!lm3533)
+		return -ENOMEM;
+
+	i2c_set_clientdata(i2c, lm3533);
+
+	lm3533->regmap = regmap_init_i2c(i2c, &regmap_config);
+	if (IS_ERR(lm3533->regmap)) {
+		ret = PTR_ERR(lm3533->regmap);
+		goto err_regmap;
+	}
+
+	lm3533->dev = &i2c->dev;
+	lm3533->irq = i2c->irq;
+
+	ret = lm3533_device_init(lm3533);
+	if (ret)
+		goto err_dev;
+
+	return 0;
+
+err_dev:
+	regmap_exit(lm3533->regmap);
+err_regmap:
+	kfree(lm3533);
+
+	return ret;
+}
+
+static int __devexit lm3533_i2c_remove(struct i2c_client *i2c)
+{
+	struct lm3533 *lm3533 = i2c_get_clientdata(i2c);
+
+	dev_dbg(&i2c->dev, "%s\n", __func__);
+
+	lm3533_device_exit(lm3533);
+	regmap_exit(lm3533->regmap);
+
+	kfree(lm3533);
+
+	return 0;
+}
+
+static const struct i2c_device_id lm3533_i2c_ids[] = {
+	{ "lm3533", 0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(i2c, lm3533_i2c_ids);
+
+static struct i2c_driver lm3533_i2c_driver = {
+	.driver = {
+		   .name = "lm3533",
+		   .owner = THIS_MODULE,
+	},
+	.id_table	= lm3533_i2c_ids,
+	.probe		= lm3533_i2c_probe,
+	.remove		= __devexit_p(lm3533_i2c_remove),
+};
+
+static int __init lm3533_i2c_init(void)
+{
+	return i2c_add_driver(&lm3533_i2c_driver);
+}
+subsys_initcall(lm3533_i2c_init);
+
+static void __exit lm3533_i2c_exit(void)
+{
+	i2c_del_driver(&lm3533_i2c_driver);
+}
+module_exit(lm3533_i2c_exit);
+
+MODULE_AUTHOR("Johan Hovold <jhovold@gmail.com>");
+MODULE_DESCRIPTION("LM3533 Core");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/lm3533-ctrlbank.c b/drivers/mfd/lm3533-ctrlbank.c
new file mode 100644
index 0000000..c2732a3
--- /dev/null
+++ b/drivers/mfd/lm3533-ctrlbank.c
@@ -0,0 +1,134 @@
+/*
+ * lm3533-ctrlbank.c -- LM3533 Generic Control Bank interface
+ *
+ * Copyright (C) 2011-2012 Texas Instruments
+ *
+ * Author: Johan Hovold <jhovold@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under  the terms of the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+
+#include <linux/mfd/lm3533.h>
+
+
+#define LM3533_BRIGHTNESS_MAX		255
+#define LM3533_MAX_CURRENT_MAX		31
+#define LM3533_PWM_MAX			0x3f
+
+#define LM3533_REG_PWM_BASE		0x14
+#define LM3533_REG_MAX_CURRENT_BASE	0x1f
+#define LM3533_REG_CTRLBANK_ENABLE	0x27
+#define LM3533_REG_BRIGHTNESS_BASE	0x40
+
+
+static inline u8 lm3533_ctrlbank_get_reg(struct lm3533_ctrlbank *cb, u8 base)
+{
+	return base + cb->id;
+}
+
+int lm3533_ctrlbank_enable(struct lm3533_ctrlbank *cb)
+{
+	u8 mask;
+	int ret;
+
+	dev_dbg(cb->dev, "%s - %d\n", __func__, cb->id);
+
+	mask = 1 << cb->id;
+	ret = lm3533_update(cb->lm3533, LM3533_REG_CTRLBANK_ENABLE,
+								mask, mask);
+	if (ret)
+		dev_err(cb->dev, "failed to enable ctrlbank %d\n", cb->id);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lm3533_ctrlbank_enable);
+
+int lm3533_ctrlbank_disable(struct lm3533_ctrlbank *cb)
+{
+	u8 mask;
+	int ret;
+
+	dev_dbg(cb->dev, "%s - %d\n", __func__, cb->id);
+
+	mask = 1 << cb->id;
+	ret = lm3533_update(cb->lm3533, LM3533_REG_CTRLBANK_ENABLE, 0, mask);
+	if (ret)
+		dev_err(cb->dev, "failed to disable ctrlbank %d\n", cb->id);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lm3533_ctrlbank_disable);
+
+#define lm3533_ctrlbank_set(_name, _NAME)				\
+int lm3533_ctrlbank_set_##_name(struct lm3533_ctrlbank *cb, u8 val)	\
+{									\
+	u8 reg;								\
+	int ret;							\
+									\
+	if (val > LM3533_##_NAME##_MAX)					\
+		return -EINVAL;						\
+									\
+	reg = lm3533_ctrlbank_get_reg(cb, LM3533_REG_##_NAME##_BASE);	\
+	ret = lm3533_write(cb->lm3533, reg, val);			\
+	if (ret)							\
+		dev_err(cb->dev, "failed to set " #_name "\n");		\
+									\
+	return ret;							\
+}									\
+EXPORT_SYMBOL_GPL(lm3533_ctrlbank_set_##_name);
+
+#define lm3533_ctrlbank_get(_name, _NAME)				\
+int lm3533_ctrlbank_get_##_name(struct lm3533_ctrlbank *cb, u8 *val)	\
+{									\
+	u8 reg;								\
+	int ret;							\
+									\
+	reg = lm3533_ctrlbank_get_reg(cb, LM3533_REG_##_NAME##_BASE);	\
+	ret = lm3533_read(cb->lm3533, reg, val);			\
+	if (ret)							\
+		dev_err(cb->dev, "failed to get " #_name "\n");		\
+									\
+	return ret;							\
+}									\
+EXPORT_SYMBOL_GPL(lm3533_ctrlbank_get_##_name);
+
+lm3533_ctrlbank_set(brightness, BRIGHTNESS);
+lm3533_ctrlbank_get(brightness, BRIGHTNESS);
+
+/*
+ * Full scale current.
+ *
+ * Imax = 5 + val * 0.8 mA, e.g.:
+ *
+ *    0 - 5 mA
+ *     ...
+ *   19 - 20.2 mA (default)
+ *     ...
+ *   31 - 29.8 mA
+ */
+lm3533_ctrlbank_set(max_current, MAX_CURRENT);
+lm3533_ctrlbank_get(max_current, MAX_CURRENT);
+
+/*
+ * PWM-input control mask:
+ *
+ *   bit 5 - PWM-input enabled in Zone 4
+ *   bit 4 - PWM-input enabled in Zone 3
+ *   bit 3 - PWM-input enabled in Zone 2
+ *   bit 2 - PWM-input enabled in Zone 1
+ *   bit 1 - PWM-input enabled in Zone 0
+ *   bit 0 - PWM-input enabled
+ */
+lm3533_ctrlbank_set(pwm, PWM);
+lm3533_ctrlbank_get(pwm, PWM);
+
+
+MODULE_AUTHOR("Johan Hovold <jhovold@gmail.com>");
+MODULE_DESCRIPTION("LM3533 Control Bank interface");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/lm3533.h b/include/linux/mfd/lm3533.h
new file mode 100644
index 0000000..75f85f3
--- /dev/null
+++ b/include/linux/mfd/lm3533.h
@@ -0,0 +1,89 @@
+/*
+ * lm3533.h -- LM3533 interface
+ *
+ * Copyright (C) 2011-2012 Texas Instruments
+ *
+ * Author: Johan Hovold <jhovold@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under  the terms of the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#ifndef __LINUX_MFD_LM3533_H
+#define __LINUX_MFD_LM3533_H
+
+#define LM3533_ATTR_RO(_name) \
+	DEVICE_ATTR(_name, S_IRUGO, show_##_name, NULL)
+#define LM3533_ATTR_RW(_name) \
+	DEVICE_ATTR(_name, S_IRUGO | S_IWUSR , show_##_name, store_##_name)
+
+struct device;
+struct regmap;
+
+struct lm3533 {
+	struct device *dev;
+
+	struct regmap *regmap;
+
+	int gpio_hwen;
+	int irq;
+
+	unsigned have_als:1;
+	unsigned have_backlights:1;
+	unsigned have_leds:1;
+};
+
+struct lm3533_ctrlbank {
+	struct lm3533 *lm3533;
+	struct device *dev;
+	int id;
+};
+
+struct lm3533_als_platform_data {
+	unsigned pwm_mode:1;		/* PWM input mode (default analog) */
+};
+
+struct lm3533_bl_platform_data {
+	char *name;
+	u8 default_brightness;		/* 0 - 255 */
+	u8 max_current;			/* 0 - 31 */
+	u8 pwm;				/* 0 - 0x3f */
+};
+
+struct lm3533_led_platform_data {
+	char *name;
+	const char *default_trigger;
+	u8 max_current;			/* 0 - 31 */
+	u8 pwm;				/* 0 - 0x3f */
+};
+
+struct lm3533_platform_data {
+	int gpio_hwen;
+
+	struct lm3533_als_platform_data *als;
+
+	struct lm3533_bl_platform_data *backlights;
+	int num_backlights;
+
+	struct lm3533_led_platform_data *leds;
+	int num_leds;
+};
+
+extern int lm3533_ctrlbank_enable(struct lm3533_ctrlbank *cb);
+extern int lm3533_ctrlbank_disable(struct lm3533_ctrlbank *cb);
+
+extern int lm3533_ctrlbank_set_brightness(struct lm3533_ctrlbank *cb, u8 val);
+extern int lm3533_ctrlbank_get_brightness(struct lm3533_ctrlbank *cb, u8 *val);
+extern int lm3533_ctrlbank_set_max_current(struct lm3533_ctrlbank *cb, u8 val);
+extern int lm3533_ctrlbank_get_max_current(struct lm3533_ctrlbank *cb,
+								u8 *val);
+extern int lm3533_ctrlbank_set_pwm(struct lm3533_ctrlbank *cb, u8 val);
+extern int lm3533_ctrlbank_get_pwm(struct lm3533_ctrlbank *cb, u8 *val);
+
+extern int lm3533_read(struct lm3533 *lm3533, u8 reg, u8 *val);
+extern int lm3533_write(struct lm3533 *lm3533, u8 reg, u8 val);
+extern int lm3533_update(struct lm3533 *lm3533, u8 reg, u8 val, u8 mask);
+
+#endif	/* __LINUX_MFD_LM3533_H */
-- 
cgit v0.10.2


From 887c8ec7219fc8eba78bb8f44a74c660934e9b98 Mon Sep 17 00:00:00 2001
From: Aaron Sierra <asierra@xes-inc.com>
Date: Fri, 20 Apr 2012 14:14:11 -0500
Subject: watchdog: Convert iTCO_wdt driver to mfd model

This patch converts the iTCO_wdt driver to use the multi-function device
driver model. It uses resources discovered by the lpc_ich driver, so that
it no longer does its own PCI scanning.

Signed-off-by: Aaron Sierra <asierra@xes-inc.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 211f5de..1e9a7d5 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -785,7 +785,8 @@ config LPC_ICH
 	help
 	  The LPC bridge function of the Intel ICH provides support for
 	  many functional units. This driver provides needed support for
-	  other drivers to control these functions, currently GPIO.
+	  other drivers to control these functions, currently GPIO and
+	  watchdog.
 
 config MFD_RDC321X
 	tristate "Support for RDC-R321x southbridge"
diff --git a/drivers/mfd/lpc_ich.c b/drivers/mfd/lpc_ich.c
index 7e3a7b6..027cc8f 100644
--- a/drivers/mfd/lpc_ich.c
+++ b/drivers/mfd/lpc_ich.c
@@ -65,14 +65,42 @@
 #define ACPIBASE		0x40
 #define ACPIBASE_GPE_OFF	0x28
 #define ACPIBASE_GPE_END	0x2f
+#define ACPIBASE_SMI_OFF	0x30
+#define ACPIBASE_SMI_END	0x33
+#define ACPIBASE_TCO_OFF	0x60
+#define ACPIBASE_TCO_END	0x7f
 #define ACPICTRL		0x44
 
+#define ACPIBASE_GCS_OFF	0x3410
+#define ACPIBASE_GCS_END	0x3414
+
 #define GPIOBASE		0x48
 #define GPIOCTRL		0x4C
 
+#define RCBABASE		0xf0
+
+#define wdt_io_res(i) wdt_res(0, i)
+#define wdt_mem_res(i) wdt_res(ICH_RES_MEM_OFF, i)
+#define wdt_res(b, i) (&wdt_ich_res[(b) + (i)])
+
 static int lpc_ich_acpi_save = -1;
 static int lpc_ich_gpio_save = -1;
 
+static struct resource wdt_ich_res[] = {
+	/* ACPI - TCO */
+	{
+		.flags = IORESOURCE_IO,
+	},
+	/* ACPI - SMI */
+	{
+		.flags = IORESOURCE_IO,
+	},
+	/* GCS */
+	{
+		.flags = IORESOURCE_MEM,
+	},
+};
+
 static struct resource gpio_ich_res[] = {
 	/* GPIO */
 	{
@@ -85,10 +113,17 @@ static struct resource gpio_ich_res[] = {
 };
 
 enum lpc_cells {
-	LPC_GPIO = 0,
+	LPC_WDT = 0,
+	LPC_GPIO,
 };
 
 static struct mfd_cell lpc_ich_cells[] = {
+	[LPC_WDT] = {
+		.name = "iTCO_wdt",
+		.num_resources = ARRAY_SIZE(wdt_ich_res),
+		.resources = wdt_ich_res,
+		.ignore_resource_conflicts = true,
+	},
 	[LPC_GPIO] = {
 		.name = "gpio_ich",
 		.num_resources = ARRAY_SIZE(gpio_ich_res),
@@ -162,218 +197,276 @@ enum lpc_chipsets {
 struct lpc_ich_info lpc_chipset_info[] __devinitdata = {
 	[LPC_ICH] = {
 		.name = "ICH",
+		.iTCO_version = 1,
 	},
 	[LPC_ICH0] = {
 		.name = "ICH0",
+		.iTCO_version = 1,
 	},
 	[LPC_ICH2] = {
 		.name = "ICH2",
+		.iTCO_version = 1,
 	},
 	[LPC_ICH2M] = {
 		.name = "ICH2-M",
+		.iTCO_version = 1,
 	},
 	[LPC_ICH3] = {
 		.name = "ICH3-S",
+		.iTCO_version = 1,
 	},
 	[LPC_ICH3M] = {
 		.name = "ICH3-M",
+		.iTCO_version = 1,
 	},
 	[LPC_ICH4] = {
 		.name = "ICH4",
+		.iTCO_version = 1,
 	},
 	[LPC_ICH4M] = {
 		.name = "ICH4-M",
+		.iTCO_version = 1,
 	},
 	[LPC_CICH] = {
 		.name = "C-ICH",
+		.iTCO_version = 1,
 	},
 	[LPC_ICH5] = {
 		.name = "ICH5 or ICH5R",
+		.iTCO_version = 1,
 	},
 	[LPC_6300ESB] = {
 		.name = "6300ESB",
+		.iTCO_version = 1,
 	},
 	[LPC_ICH6] = {
 		.name = "ICH6 or ICH6R",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V6_GPIO,
 	},
 	[LPC_ICH6M] = {
 		.name = "ICH6-M",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V6_GPIO,
 	},
 	[LPC_ICH6W] = {
 		.name = "ICH6W or ICH6RW",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V6_GPIO,
 	},
 	[LPC_631XESB] = {
 		.name = "631xESB/632xESB",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V6_GPIO,
 	},
 	[LPC_ICH7] = {
 		.name = "ICH7 or ICH7R",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V7_GPIO,
 	},
 	[LPC_ICH7DH] = {
 		.name = "ICH7DH",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V7_GPIO,
 	},
 	[LPC_ICH7M] = {
 		.name = "ICH7-M or ICH7-U",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V7_GPIO,
 	},
 	[LPC_ICH7MDH] = {
 		.name = "ICH7-M DH",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V7_GPIO,
 	},
 	[LPC_NM10] = {
 		.name = "NM10",
+		.iTCO_version = 2,
 	},
 	[LPC_ICH8] = {
 		.name = "ICH8 or ICH8R",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V7_GPIO,
 	},
 	[LPC_ICH8DH] = {
 		.name = "ICH8DH",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V7_GPIO,
 	},
 	[LPC_ICH8DO] = {
 		.name = "ICH8DO",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V7_GPIO,
 	},
 	[LPC_ICH8M] = {
 		.name = "ICH8M",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V7_GPIO,
 	},
 	[LPC_ICH8ME] = {
 		.name = "ICH8M-E",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V7_GPIO,
 	},
 	[LPC_ICH9] = {
 		.name = "ICH9",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V9_GPIO,
 	},
 	[LPC_ICH9R] = {
 		.name = "ICH9R",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V9_GPIO,
 	},
 	[LPC_ICH9DH] = {
 		.name = "ICH9DH",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V9_GPIO,
 	},
 	[LPC_ICH9DO] = {
 		.name = "ICH9DO",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V9_GPIO,
 	},
 	[LPC_ICH9M] = {
 		.name = "ICH9M",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V9_GPIO,
 	},
 	[LPC_ICH9ME] = {
 		.name = "ICH9M-E",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V9_GPIO,
 	},
 	[LPC_ICH10] = {
 		.name = "ICH10",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V10CONS_GPIO,
 	},
 	[LPC_ICH10R] = {
 		.name = "ICH10R",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V10CONS_GPIO,
 	},
 	[LPC_ICH10D] = {
 		.name = "ICH10D",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V10CORP_GPIO,
 	},
 	[LPC_ICH10DO] = {
 		.name = "ICH10DO",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V10CORP_GPIO,
 	},
 	[LPC_PCH] = {
 		.name = "PCH Desktop Full Featured",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_PCHM] = {
 		.name = "PCH Mobile Full Featured",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_P55] = {
 		.name = "P55",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_PM55] = {
 		.name = "PM55",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_H55] = {
 		.name = "H55",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_QM57] = {
 		.name = "QM57",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_H57] = {
 		.name = "H57",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_HM55] = {
 		.name = "HM55",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_Q57] = {
 		.name = "Q57",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_HM57] = {
 		.name = "HM57",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_PCHMSFF] = {
 		.name = "PCH Mobile SFF Full Featured",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_QS57] = {
 		.name = "QS57",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_3400] = {
 		.name = "3400",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_3420] = {
 		.name = "3420",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_3450] = {
 		.name = "3450",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_EP80579] = {
 		.name = "EP80579",
+		.iTCO_version = 2,
 	},
 	[LPC_CPT] = {
 		.name = "Cougar Point",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_CPTD] = {
 		.name = "Cougar Point Desktop",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_CPTM] = {
 		.name = "Cougar Point Mobile",
+		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
 	},
 	[LPC_PBG] = {
 		.name = "Patsburg",
+		.iTCO_version = 2,
 	},
 	[LPC_DH89XXCC] = {
 		.name = "DH89xxCC",
+		.iTCO_version = 2,
 	},
 	[LPC_PPT] = {
 		.name = "Panther Point",
+		.iTCO_version = 2,
 	},
 	[LPC_LPT] = {
 		.name = "Lynx Point",
+		.iTCO_version = 2,
 	},
 };
 
@@ -666,12 +759,88 @@ gpio_done:
 	return ret;
 }
 
+static int __devinit lpc_ich_init_wdt(struct pci_dev *dev,
+				const struct pci_device_id *id)
+{
+	u32 base_addr_cfg;
+	u32 base_addr;
+	int ret;
+	bool acpi_conflict = false;
+	struct resource *res;
+
+	/* Setup power management base register */
+	pci_read_config_dword(dev, ACPIBASE, &base_addr_cfg);
+	base_addr = base_addr_cfg & 0x0000ff80;
+	if (!base_addr) {
+		dev_err(&dev->dev, "I/O space for ACPI uninitialized\n");
+		ret = -ENODEV;
+		goto wdt_done;
+	}
+
+	res = wdt_io_res(ICH_RES_IO_TCO);
+	res->start = base_addr + ACPIBASE_TCO_OFF;
+	res->end = base_addr + ACPIBASE_TCO_END;
+	ret = acpi_check_resource_conflict(res);
+	if (ret) {
+		acpi_conflict = true;
+		goto wdt_done;
+	}
+
+	res = wdt_io_res(ICH_RES_IO_SMI);
+	res->start = base_addr + ACPIBASE_SMI_OFF;
+	res->end = base_addr + ACPIBASE_SMI_END;
+	ret = acpi_check_resource_conflict(res);
+	if (ret) {
+		acpi_conflict = true;
+		goto wdt_done;
+	}
+	lpc_ich_enable_acpi_space(dev);
+
+	/*
+	 * Get the Memory-Mapped GCS register. To get access to it
+	 * we have to read RCBA from PCI Config space 0xf0 and use
+	 * it as base. GCS = RCBA + ICH6_GCS(0x3410).
+	 */
+	if (lpc_chipset_info[id->driver_data].iTCO_version == 2) {
+		pci_read_config_dword(dev, RCBABASE, &base_addr_cfg);
+		base_addr = base_addr_cfg & 0xffffc000;
+		if (!(base_addr_cfg & 1)) {
+			pr_err("RCBA is disabled by hardware/BIOS, "
+					"device disabled\n");
+			ret = -ENODEV;
+			goto wdt_done;
+		}
+		res = wdt_mem_res(ICH_RES_MEM_GCS);
+		res->start = base_addr + ACPIBASE_GCS_OFF;
+		res->end = base_addr + ACPIBASE_GCS_END;
+		ret = acpi_check_resource_conflict(res);
+		if (ret) {
+			acpi_conflict = true;
+			goto wdt_done;
+		}
+	}
+
+	lpc_ich_finalize_cell(&lpc_ich_cells[LPC_WDT], id);
+	ret = mfd_add_devices(&dev->dev, -1, &lpc_ich_cells[LPC_WDT],
+				1, NULL, 0);
+
+wdt_done:
+	if (acpi_conflict)
+		pr_warn("Resource conflict(s) found affecting %s\n",
+				lpc_ich_cells[LPC_WDT].name);
+	return ret;
+}
+
 static int __devinit lpc_ich_probe(struct pci_dev *dev,
 				const struct pci_device_id *id)
 {
 	int ret;
 	bool cell_added = false;
 
+	ret = lpc_ich_init_wdt(dev, id);
+	if (!ret)
+		cell_added = true;
+
 	ret = lpc_ich_init_gpio(dev, id);
 	if (!ret)
 		cell_added = true;
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 3709624..a9ed087 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -563,6 +563,7 @@ config INTEL_SCU_WATCHDOG
 config ITCO_WDT
 	tristate "Intel TCO Timer/Watchdog"
 	depends on (X86 || IA64) && PCI
+	select LPC_ICH
 	---help---
 	  Hardware driver for the intel TCO timer based watchdog devices.
 	  These drivers are included in the Intel 82801 I/O Controller
diff --git a/drivers/watchdog/iTCO_vendor.h b/drivers/watchdog/iTCO_vendor.h
index 9e27e64..3c57b455 100644
--- a/drivers/watchdog/iTCO_vendor.h
+++ b/drivers/watchdog/iTCO_vendor.h
@@ -1,8 +1,8 @@
 /* iTCO Vendor Specific Support hooks */
 #ifdef CONFIG_ITCO_VENDOR_SUPPORT
-extern void iTCO_vendor_pre_start(unsigned long, unsigned int);
-extern void iTCO_vendor_pre_stop(unsigned long);
-extern void iTCO_vendor_pre_keepalive(unsigned long, unsigned int);
+extern void iTCO_vendor_pre_start(struct resource *, unsigned int);
+extern void iTCO_vendor_pre_stop(struct resource *);
+extern void iTCO_vendor_pre_keepalive(struct resource *, unsigned int);
 extern void iTCO_vendor_pre_set_heartbeat(unsigned int);
 extern int iTCO_vendor_check_noreboot_on(void);
 #else
diff --git a/drivers/watchdog/iTCO_vendor_support.c b/drivers/watchdog/iTCO_vendor_support.c
index 2721d29..b6b2f90 100644
--- a/drivers/watchdog/iTCO_vendor_support.c
+++ b/drivers/watchdog/iTCO_vendor_support.c
@@ -35,11 +35,6 @@
 
 #include "iTCO_vendor.h"
 
-/* iTCO defines */
-#define	SMI_EN		(acpibase + 0x30) /* SMI Control and Enable Register */
-#define	TCOBASE		(acpibase + 0x60) /* TCO base address */
-#define	TCO1_STS	(TCOBASE + 0x04)  /* TCO1 Status Register */
-
 /* List of vendor support modes */
 /* SuperMicro Pentium 3 Era 370SSE+-OEM1/P3TSSE */
 #define SUPERMICRO_OLD_BOARD	1
@@ -82,24 +77,24 @@ MODULE_PARM_DESC(vendorsupport, "iTCO vendor specific support mode, default="
  *	    20.6 seconds.
  */
 
-static void supermicro_old_pre_start(unsigned long acpibase)
+static void supermicro_old_pre_start(struct resource *smires)
 {
 	unsigned long val32;
 
 	/* Bit 13: TCO_EN -> 0 = Disables TCO logic generating an SMI# */
-	val32 = inl(SMI_EN);
+	val32 = inl(smires->start);
 	val32 &= 0xffffdfff;	/* Turn off SMI clearing watchdog */
-	outl(val32, SMI_EN);	/* Needed to activate watchdog */
+	outl(val32, smires->start);	/* Needed to activate watchdog */
 }
 
-static void supermicro_old_pre_stop(unsigned long acpibase)
+static void supermicro_old_pre_stop(struct resource *smires)
 {
 	unsigned long val32;
 
 	/* Bit 13: TCO_EN -> 1 = Enables the TCO logic to generate SMI# */
-	val32 = inl(SMI_EN);
+	val32 = inl(smires->start);
 	val32 |= 0x00002000;	/* Turn on SMI clearing watchdog */
-	outl(val32, SMI_EN);	/* Needed to deactivate watchdog */
+	outl(val32, smires->start);	/* Needed to deactivate watchdog */
 }
 
 /*
@@ -270,66 +265,66 @@ static void supermicro_new_pre_set_heartbeat(unsigned int heartbeat)
  *	Don't use this fix if you don't need to!!!
  */
 
-static void broken_bios_start(unsigned long acpibase)
+static void broken_bios_start(struct resource *smires)
 {
 	unsigned long val32;
 
-	val32 = inl(SMI_EN);
+	val32 = inl(smires->start);
 	/* Bit 13: TCO_EN     -> 0 = Disables TCO logic generating an SMI#
 	   Bit  0: GBL_SMI_EN -> 0 = No SMI# will be generated by ICH. */
 	val32 &= 0xffffdffe;
-	outl(val32, SMI_EN);
+	outl(val32, smires->start);
 }
 
-static void broken_bios_stop(unsigned long acpibase)
+static void broken_bios_stop(struct resource *smires)
 {
 	unsigned long val32;
 
-	val32 = inl(SMI_EN);
+	val32 = inl(smires->start);
 	/* Bit 13: TCO_EN     -> 1 = Enables TCO logic generating an SMI#
 	   Bit  0: GBL_SMI_EN -> 1 = Turn global SMI on again. */
 	val32 |= 0x00002001;
-	outl(val32, SMI_EN);
+	outl(val32, smires->start);
 }
 
 /*
  *	Generic Support Functions
  */
 
-void iTCO_vendor_pre_start(unsigned long acpibase,
+void iTCO_vendor_pre_start(struct resource *smires,
 			   unsigned int heartbeat)
 {
 	switch (vendorsupport) {
 	case SUPERMICRO_OLD_BOARD:
-		supermicro_old_pre_start(acpibase);
+		supermicro_old_pre_start(smires);
 		break;
 	case SUPERMICRO_NEW_BOARD:
 		supermicro_new_pre_start(heartbeat);
 		break;
 	case BROKEN_BIOS:
-		broken_bios_start(acpibase);
+		broken_bios_start(smires);
 		break;
 	}
 }
 EXPORT_SYMBOL(iTCO_vendor_pre_start);
 
-void iTCO_vendor_pre_stop(unsigned long acpibase)
+void iTCO_vendor_pre_stop(struct resource *smires)
 {
 	switch (vendorsupport) {
 	case SUPERMICRO_OLD_BOARD:
-		supermicro_old_pre_stop(acpibase);
+		supermicro_old_pre_stop(smires);
 		break;
 	case SUPERMICRO_NEW_BOARD:
 		supermicro_new_pre_stop();
 		break;
 	case BROKEN_BIOS:
-		broken_bios_stop(acpibase);
+		broken_bios_stop(smires);
 		break;
 	}
 }
 EXPORT_SYMBOL(iTCO_vendor_pre_stop);
 
-void iTCO_vendor_pre_keepalive(unsigned long acpibase, unsigned int heartbeat)
+void iTCO_vendor_pre_keepalive(struct resource *smires, unsigned int heartbeat)
 {
 	if (vendorsupport == SUPERMICRO_NEW_BOARD)
 		supermicro_new_pre_set_heartbeat(heartbeat);
diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index 9fecb95..741528b 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -66,316 +66,16 @@
 #include <linux/spinlock.h>		/* For spin_lock/spin_unlock/... */
 #include <linux/uaccess.h>		/* For copy_to_user/put_user/... */
 #include <linux/io.h>			/* For inb/outb/... */
+#include <linux/mfd/core.h>
+#include <linux/mfd/lpc_ich.h>
 
 #include "iTCO_vendor.h"
 
-/* TCO related info */
-enum iTCO_chipsets {
-	TCO_ICH = 0,	/* ICH */
-	TCO_ICH0,	/* ICH0 */
-	TCO_ICH2,	/* ICH2 */
-	TCO_ICH2M,	/* ICH2-M */
-	TCO_ICH3,	/* ICH3-S */
-	TCO_ICH3M,	/* ICH3-M */
-	TCO_ICH4,	/* ICH4 */
-	TCO_ICH4M,	/* ICH4-M */
-	TCO_CICH,	/* C-ICH */
-	TCO_ICH5,	/* ICH5 & ICH5R */
-	TCO_6300ESB,	/* 6300ESB */
-	TCO_ICH6,	/* ICH6 & ICH6R */
-	TCO_ICH6M,	/* ICH6-M */
-	TCO_ICH6W,	/* ICH6W & ICH6RW */
-	TCO_631XESB,	/* 631xESB/632xESB */
-	TCO_ICH7,	/* ICH7 & ICH7R */
-	TCO_ICH7DH,	/* ICH7DH */
-	TCO_ICH7M,	/* ICH7-M & ICH7-U */
-	TCO_ICH7MDH,	/* ICH7-M DH */
-	TCO_NM10,	/* NM10 */
-	TCO_ICH8,	/* ICH8 & ICH8R */
-	TCO_ICH8DH,	/* ICH8DH */
-	TCO_ICH8DO,	/* ICH8DO */
-	TCO_ICH8M,	/* ICH8M */
-	TCO_ICH8ME,	/* ICH8M-E */
-	TCO_ICH9,	/* ICH9 */
-	TCO_ICH9R,	/* ICH9R */
-	TCO_ICH9DH,	/* ICH9DH */
-	TCO_ICH9DO,	/* ICH9DO */
-	TCO_ICH9M,	/* ICH9M */
-	TCO_ICH9ME,	/* ICH9M-E */
-	TCO_ICH10,	/* ICH10 */
-	TCO_ICH10R,	/* ICH10R */
-	TCO_ICH10D,	/* ICH10D */
-	TCO_ICH10DO,	/* ICH10DO */
-	TCO_PCH,	/* PCH Desktop Full Featured */
-	TCO_PCHM,	/* PCH Mobile Full Featured */
-	TCO_P55,	/* P55 */
-	TCO_PM55,	/* PM55 */
-	TCO_H55,	/* H55 */
-	TCO_QM57,	/* QM57 */
-	TCO_H57,	/* H57 */
-	TCO_HM55,	/* HM55 */
-	TCO_Q57,	/* Q57 */
-	TCO_HM57,	/* HM57 */
-	TCO_PCHMSFF,	/* PCH Mobile SFF Full Featured */
-	TCO_QS57,	/* QS57 */
-	TCO_3400,	/* 3400 */
-	TCO_3420,	/* 3420 */
-	TCO_3450,	/* 3450 */
-	TCO_EP80579,	/* EP80579 */
-	TCO_CPT,	/* Cougar Point */
-	TCO_CPTD,	/* Cougar Point Desktop */
-	TCO_CPTM,	/* Cougar Point Mobile */
-	TCO_PBG,	/* Patsburg */
-	TCO_DH89XXCC,	/* DH89xxCC */
-	TCO_PPT,	/* Panther Point */
-	TCO_LPT,	/* Lynx Point */
-};
-
-static struct {
-	char *name;
-	unsigned int iTCO_version;
-} iTCO_chipset_info[] __devinitdata = {
-	{"ICH", 1},
-	{"ICH0", 1},
-	{"ICH2", 1},
-	{"ICH2-M", 1},
-	{"ICH3-S", 1},
-	{"ICH3-M", 1},
-	{"ICH4", 1},
-	{"ICH4-M", 1},
-	{"C-ICH", 1},
-	{"ICH5 or ICH5R", 1},
-	{"6300ESB", 1},
-	{"ICH6 or ICH6R", 2},
-	{"ICH6-M", 2},
-	{"ICH6W or ICH6RW", 2},
-	{"631xESB/632xESB", 2},
-	{"ICH7 or ICH7R", 2},
-	{"ICH7DH", 2},
-	{"ICH7-M or ICH7-U", 2},
-	{"ICH7-M DH", 2},
-	{"NM10", 2},
-	{"ICH8 or ICH8R", 2},
-	{"ICH8DH", 2},
-	{"ICH8DO", 2},
-	{"ICH8M", 2},
-	{"ICH8M-E", 2},
-	{"ICH9", 2},
-	{"ICH9R", 2},
-	{"ICH9DH", 2},
-	{"ICH9DO", 2},
-	{"ICH9M", 2},
-	{"ICH9M-E", 2},
-	{"ICH10", 2},
-	{"ICH10R", 2},
-	{"ICH10D", 2},
-	{"ICH10DO", 2},
-	{"PCH Desktop Full Featured", 2},
-	{"PCH Mobile Full Featured", 2},
-	{"P55", 2},
-	{"PM55", 2},
-	{"H55", 2},
-	{"QM57", 2},
-	{"H57", 2},
-	{"HM55", 2},
-	{"Q57", 2},
-	{"HM57", 2},
-	{"PCH Mobile SFF Full Featured", 2},
-	{"QS57", 2},
-	{"3400", 2},
-	{"3420", 2},
-	{"3450", 2},
-	{"EP80579", 2},
-	{"Cougar Point", 2},
-	{"Cougar Point Desktop", 2},
-	{"Cougar Point Mobile", 2},
-	{"Patsburg", 2},
-	{"DH89xxCC", 2},
-	{"Panther Point", 2},
-	{"Lynx Point", 2},
-	{NULL, 0}
-};
-
-/*
- * This data only exists for exporting the supported PCI ids
- * via MODULE_DEVICE_TABLE.  We do not actually register a
- * pci_driver, because the I/O Controller Hub has also other
- * functions that probably will be registered by other drivers.
- */
-static DEFINE_PCI_DEVICE_TABLE(iTCO_wdt_pci_tbl) = {
-	{ PCI_VDEVICE(INTEL, 0x2410), TCO_ICH},
-	{ PCI_VDEVICE(INTEL, 0x2420), TCO_ICH0},
-	{ PCI_VDEVICE(INTEL, 0x2440), TCO_ICH2},
-	{ PCI_VDEVICE(INTEL, 0x244c), TCO_ICH2M},
-	{ PCI_VDEVICE(INTEL, 0x2480), TCO_ICH3},
-	{ PCI_VDEVICE(INTEL, 0x248c), TCO_ICH3M},
-	{ PCI_VDEVICE(INTEL, 0x24c0), TCO_ICH4},
-	{ PCI_VDEVICE(INTEL, 0x24cc), TCO_ICH4M},
-	{ PCI_VDEVICE(INTEL, 0x2450), TCO_CICH},
-	{ PCI_VDEVICE(INTEL, 0x24d0), TCO_ICH5},
-	{ PCI_VDEVICE(INTEL, 0x25a1), TCO_6300ESB},
-	{ PCI_VDEVICE(INTEL, 0x2640), TCO_ICH6},
-	{ PCI_VDEVICE(INTEL, 0x2641), TCO_ICH6M},
-	{ PCI_VDEVICE(INTEL, 0x2642), TCO_ICH6W},
-	{ PCI_VDEVICE(INTEL, 0x2670), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2671), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2672), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2673), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2674), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2675), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2676), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2677), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2678), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2679), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x267a), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x267b), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x267c), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x267d), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x267e), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x267f), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x27b8), TCO_ICH7},
-	{ PCI_VDEVICE(INTEL, 0x27b0), TCO_ICH7DH},
-	{ PCI_VDEVICE(INTEL, 0x27b9), TCO_ICH7M},
-	{ PCI_VDEVICE(INTEL, 0x27bd), TCO_ICH7MDH},
-	{ PCI_VDEVICE(INTEL, 0x27bc), TCO_NM10},
-	{ PCI_VDEVICE(INTEL, 0x2810), TCO_ICH8},
-	{ PCI_VDEVICE(INTEL, 0x2812), TCO_ICH8DH},
-	{ PCI_VDEVICE(INTEL, 0x2814), TCO_ICH8DO},
-	{ PCI_VDEVICE(INTEL, 0x2815), TCO_ICH8M},
-	{ PCI_VDEVICE(INTEL, 0x2811), TCO_ICH8ME},
-	{ PCI_VDEVICE(INTEL, 0x2918), TCO_ICH9},
-	{ PCI_VDEVICE(INTEL, 0x2916), TCO_ICH9R},
-	{ PCI_VDEVICE(INTEL, 0x2912), TCO_ICH9DH},
-	{ PCI_VDEVICE(INTEL, 0x2914), TCO_ICH9DO},
-	{ PCI_VDEVICE(INTEL, 0x2919), TCO_ICH9M},
-	{ PCI_VDEVICE(INTEL, 0x2917), TCO_ICH9ME},
-	{ PCI_VDEVICE(INTEL, 0x3a18), TCO_ICH10},
-	{ PCI_VDEVICE(INTEL, 0x3a16), TCO_ICH10R},
-	{ PCI_VDEVICE(INTEL, 0x3a1a), TCO_ICH10D},
-	{ PCI_VDEVICE(INTEL, 0x3a14), TCO_ICH10DO},
-	{ PCI_VDEVICE(INTEL, 0x3b00), TCO_PCH},
-	{ PCI_VDEVICE(INTEL, 0x3b01), TCO_PCHM},
-	{ PCI_VDEVICE(INTEL, 0x3b02), TCO_P55},
-	{ PCI_VDEVICE(INTEL, 0x3b03), TCO_PM55},
-	{ PCI_VDEVICE(INTEL, 0x3b06), TCO_H55},
-	{ PCI_VDEVICE(INTEL, 0x3b07), TCO_QM57},
-	{ PCI_VDEVICE(INTEL, 0x3b08), TCO_H57},
-	{ PCI_VDEVICE(INTEL, 0x3b09), TCO_HM55},
-	{ PCI_VDEVICE(INTEL, 0x3b0a), TCO_Q57},
-	{ PCI_VDEVICE(INTEL, 0x3b0b), TCO_HM57},
-	{ PCI_VDEVICE(INTEL, 0x3b0d), TCO_PCHMSFF},
-	{ PCI_VDEVICE(INTEL, 0x3b0f), TCO_QS57},
-	{ PCI_VDEVICE(INTEL, 0x3b12), TCO_3400},
-	{ PCI_VDEVICE(INTEL, 0x3b14), TCO_3420},
-	{ PCI_VDEVICE(INTEL, 0x3b16), TCO_3450},
-	{ PCI_VDEVICE(INTEL, 0x5031), TCO_EP80579},
-	{ PCI_VDEVICE(INTEL, 0x1c41), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c42), TCO_CPTD},
-	{ PCI_VDEVICE(INTEL, 0x1c43), TCO_CPTM},
-	{ PCI_VDEVICE(INTEL, 0x1c44), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c45), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c46), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c47), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c48), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c49), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c4a), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c4b), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c4c), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c4d), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c4e), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c4f), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c50), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c51), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c52), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c53), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c54), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c55), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c56), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c57), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c58), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c59), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c5a), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c5b), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c5c), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c5d), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c5e), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c5f), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1d40), TCO_PBG},
-	{ PCI_VDEVICE(INTEL, 0x1d41), TCO_PBG},
-	{ PCI_VDEVICE(INTEL, 0x2310), TCO_DH89XXCC},
-	{ PCI_VDEVICE(INTEL, 0x1e40), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e41), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e42), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e43), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e44), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e45), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e46), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e47), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e48), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e49), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e4a), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e4b), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e4c), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e4d), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e4e), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e4f), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e50), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e51), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e52), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e53), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e54), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e55), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e56), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e57), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e58), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e59), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e5a), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e5b), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e5c), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e5d), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e5e), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e5f), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x8c40), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c41), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c42), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c43), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c44), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c45), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c46), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c47), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c48), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c49), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c4a), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c4b), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c4c), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c4d), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c4e), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c4f), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c50), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c51), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c52), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c53), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c54), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c55), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c56), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c57), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c58), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c59), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c5a), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c5b), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c5c), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c5d), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c5e), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c5f), TCO_LPT},
-	{ 0, },			/* End of list */
-};
-MODULE_DEVICE_TABLE(pci, iTCO_wdt_pci_tbl);
-
 /* Address definitions for the TCO */
 /* TCO base address */
-#define TCOBASE		(iTCO_wdt_private.ACPIBASE + 0x60)
+#define TCOBASE		(iTCO_wdt_private.tco_res->start)
 /* SMI Control and Enable Register */
-#define SMI_EN		(iTCO_wdt_private.ACPIBASE + 0x30)
+#define SMI_EN		(iTCO_wdt_private.smi_res->start)
 
 #define TCO_RLD		(TCOBASE + 0x00) /* TCO Timer Reload and Curr. Value */
 #define TCOv1_TMR	(TCOBASE + 0x01) /* TCOv1 Timer Initial Value	*/
@@ -393,19 +93,18 @@ static char expect_release;
 static struct {		/* this is private data for the iTCO_wdt device */
 	/* TCO version/generation */
 	unsigned int iTCO_version;
-	/* The device's ACPIBASE address (TCOBASE = ACPIBASE+0x60) */
-	unsigned long ACPIBASE;
+	struct resource *tco_res;
+	struct resource *smi_res;
+	struct resource *gcs_res;
 	/* NO_REBOOT flag is Memory-Mapped GCS register bit 5 (TCO version 2)*/
 	unsigned long __iomem *gcs;
 	/* the lock for io operations */
 	spinlock_t io_lock;
+	struct platform_device *dev;
 	/* the PCI-device */
 	struct pci_dev *pdev;
 } iTCO_wdt_private;
 
-/* the watchdog platform device */
-static struct platform_device *iTCO_wdt_platform_device;
-
 /* module parameters */
 #define WATCHDOG_HEARTBEAT 30	/* 30 sec default heartbeat */
 static int heartbeat = WATCHDOG_HEARTBEAT;  /* in seconds */
@@ -485,7 +184,7 @@ static int iTCO_wdt_start(void)
 
 	spin_lock(&iTCO_wdt_private.io_lock);
 
-	iTCO_vendor_pre_start(iTCO_wdt_private.ACPIBASE, heartbeat);
+	iTCO_vendor_pre_start(iTCO_wdt_private.smi_res, heartbeat);
 
 	/* disable chipset's NO_REBOOT bit */
 	if (iTCO_wdt_unset_NO_REBOOT_bit()) {
@@ -519,7 +218,7 @@ static int iTCO_wdt_stop(void)
 
 	spin_lock(&iTCO_wdt_private.io_lock);
 
-	iTCO_vendor_pre_stop(iTCO_wdt_private.ACPIBASE);
+	iTCO_vendor_pre_stop(iTCO_wdt_private.smi_res);
 
 	/* Bit 11: TCO Timer Halt -> 1 = The TCO timer is disabled */
 	val = inw(TCO1_CNT);
@@ -541,7 +240,7 @@ static int iTCO_wdt_keepalive(void)
 {
 	spin_lock(&iTCO_wdt_private.io_lock);
 
-	iTCO_vendor_pre_keepalive(iTCO_wdt_private.ACPIBASE, heartbeat);
+	iTCO_vendor_pre_keepalive(iTCO_wdt_private.smi_res, heartbeat);
 
 	/* Reload the timer by writing to the TCO Timer Counter register */
 	if (iTCO_wdt_private.iTCO_version == 2)
@@ -786,83 +485,120 @@ static struct miscdevice iTCO_wdt_miscdev = {
  *	Init & exit routines
  */
 
-static int __devinit iTCO_wdt_init(struct pci_dev *pdev,
-		const struct pci_device_id *ent, struct platform_device *dev)
+static void __devexit iTCO_wdt_cleanup(void)
+{
+	/* Stop the timer before we leave */
+	if (!nowayout)
+		iTCO_wdt_stop();
+
+	/* Deregister */
+	misc_deregister(&iTCO_wdt_miscdev);
+
+	/* release resources */
+	release_region(iTCO_wdt_private.tco_res->start,
+			resource_size(iTCO_wdt_private.tco_res));
+	release_region(iTCO_wdt_private.smi_res->start,
+			resource_size(iTCO_wdt_private.smi_res));
+	if (iTCO_wdt_private.iTCO_version == 2) {
+		iounmap(iTCO_wdt_private.gcs);
+		release_mem_region(iTCO_wdt_private.gcs_res->start,
+				resource_size(iTCO_wdt_private.gcs_res));
+	}
+
+	iTCO_wdt_private.tco_res = NULL;
+	iTCO_wdt_private.smi_res = NULL;
+	iTCO_wdt_private.gcs_res = NULL;
+	iTCO_wdt_private.gcs = NULL;
+}
+
+static int __devinit iTCO_wdt_probe(struct platform_device *dev)
 {
-	int ret;
-	u32 base_address;
-	unsigned long RCBA;
+	int ret = -ENODEV;
 	unsigned long val32;
+	struct lpc_ich_info *ich_info = dev->dev.platform_data;
+
+	if (!ich_info)
+		goto out;
+
+	spin_lock_init(&iTCO_wdt_private.io_lock);
+
+	iTCO_wdt_private.tco_res =
+		platform_get_resource(dev, IORESOURCE_IO, ICH_RES_IO_TCO);
+	if (!iTCO_wdt_private.tco_res)
+		goto out;
+
+	iTCO_wdt_private.smi_res =
+		platform_get_resource(dev, IORESOURCE_IO, ICH_RES_IO_SMI);
+	if (!iTCO_wdt_private.smi_res)
+		goto out;
+
+	iTCO_wdt_private.iTCO_version = ich_info->iTCO_version;
+	iTCO_wdt_private.dev = dev;
+	iTCO_wdt_private.pdev = to_pci_dev(dev->dev.parent);
 
 	/*
-	 *      Find the ACPI/PM base I/O address which is the base
-	 *      for the TCO registers (TCOBASE=ACPIBASE + 0x60)
-	 *      ACPIBASE is bits [15:7] from 0x40-0x43
+	 * Get the Memory-Mapped GCS register, we need it for the
+	 * NO_REBOOT flag (TCO v2).
 	 */
-	pci_read_config_dword(pdev, 0x40, &base_address);
-	base_address &= 0x0000ff80;
-	if (base_address == 0x00000000) {
-		/* Something's wrong here, ACPIBASE has to be set */
-		pr_err("failed to get TCOBASE address, device disabled by hardware/BIOS\n");
-		return -ENODEV;
-	}
-	iTCO_wdt_private.iTCO_version =
-			iTCO_chipset_info[ent->driver_data].iTCO_version;
-	iTCO_wdt_private.ACPIBASE = base_address;
-	iTCO_wdt_private.pdev = pdev;
-
-	/* Get the Memory-Mapped GCS register, we need it for the
-	   NO_REBOOT flag (TCO v2). To get access to it you have to
-	   read RCBA from PCI Config space 0xf0 and use it as base.
-	   GCS = RCBA + ICH6_GCS(0x3410). */
 	if (iTCO_wdt_private.iTCO_version == 2) {
-		pci_read_config_dword(pdev, 0xf0, &base_address);
-		if ((base_address & 1) == 0) {
-			pr_err("RCBA is disabled by hardware/BIOS, device disabled\n");
-			ret = -ENODEV;
+		iTCO_wdt_private.gcs_res = platform_get_resource(dev,
+							IORESOURCE_MEM,
+							ICH_RES_MEM_GCS);
+
+		if (!iTCO_wdt_private.gcs_res)
+			goto out;
+
+		if (!request_mem_region(iTCO_wdt_private.gcs_res->start,
+			resource_size(iTCO_wdt_private.gcs_res), dev->name)) {
+			ret = -EBUSY;
 			goto out;
 		}
-		RCBA = base_address & 0xffffc000;
-		iTCO_wdt_private.gcs = ioremap((RCBA + 0x3410), 4);
+		iTCO_wdt_private.gcs = ioremap(iTCO_wdt_private.gcs_res->start,
+			resource_size(iTCO_wdt_private.gcs_res));
+		if (!iTCO_wdt_private.gcs) {
+			ret = -EIO;
+			goto unreg_gcs;
+		}
 	}
 
 	/* Check chipset's NO_REBOOT bit */
 	if (iTCO_wdt_unset_NO_REBOOT_bit() && iTCO_vendor_check_noreboot_on()) {
 		pr_info("unable to reset NO_REBOOT flag, device disabled by hardware/BIOS\n");
 		ret = -ENODEV;	/* Cannot reset NO_REBOOT bit */
-		goto out_unmap;
+		goto unmap_gcs;
 	}
 
 	/* Set the NO_REBOOT bit to prevent later reboots, just for sure */
 	iTCO_wdt_set_NO_REBOOT_bit();
 
 	/* The TCO logic uses the TCO_EN bit in the SMI_EN register */
-	if (!request_region(SMI_EN, 4, "iTCO_wdt")) {
-		pr_err("I/O address 0x%04lx already in use, device disabled\n",
+	if (!request_region(iTCO_wdt_private.smi_res->start,
+			resource_size(iTCO_wdt_private.smi_res), dev->name)) {
+		pr_err("I/O address 0x%04llx already in use, device disabled\n",
 		       SMI_EN);
-		ret = -EIO;
-		goto out_unmap;
+		ret = -EBUSY;
+		goto unmap_gcs;
 	}
 	if (turn_SMI_watchdog_clear_off >= iTCO_wdt_private.iTCO_version) {
-		/* Bit 13: TCO_EN -> 0 = Disables TCO logic generating an SMI# */
+		/*
+		 * Bit 13: TCO_EN -> 0
+		 * Disables TCO logic generating an SMI#
+		 */
 		val32 = inl(SMI_EN);
 		val32 &= 0xffffdfff;	/* Turn off SMI clearing watchdog */
 		outl(val32, SMI_EN);
 	}
 
-	/* The TCO I/O registers reside in a 32-byte range pointed to
-	   by the TCOBASE value */
-	if (!request_region(TCOBASE, 0x20, "iTCO_wdt")) {
-		pr_err("I/O address 0x%04lx already in use, device disabled\n",
+	if (!request_region(iTCO_wdt_private.tco_res->start,
+			resource_size(iTCO_wdt_private.tco_res), dev->name)) {
+		pr_err("I/O address 0x%04llx already in use, device disabled\n",
 		       TCOBASE);
-		ret = -EIO;
-		goto unreg_smi_en;
+		ret = -EBUSY;
+		goto unreg_smi;
 	}
 
-	pr_info("Found a %s TCO device (Version=%d, TCOBASE=0x%04lx)\n",
-		iTCO_chipset_info[ent->driver_data].name,
-		iTCO_chipset_info[ent->driver_data].iTCO_version,
-		TCOBASE);
+	pr_info("Found a %s TCO device (Version=%d, TCOBASE=0x%04llx)\n",
+		ich_info->name, ich_info->iTCO_version, TCOBASE);
 
 	/* Clear out the (probably old) status */
 	outw(0x0008, TCO1_STS);	/* Clear the Time Out Status bit */
@@ -883,7 +619,7 @@ static int __devinit iTCO_wdt_init(struct pci_dev *pdev,
 	if (ret != 0) {
 		pr_err("cannot register miscdev on minor=%d (err=%d)\n",
 		       WATCHDOG_MINOR, ret);
-		goto unreg_region;
+		goto unreg_tco;
 	}
 
 	pr_info("initialized. heartbeat=%d sec (nowayout=%d)\n",
@@ -891,62 +627,31 @@ static int __devinit iTCO_wdt_init(struct pci_dev *pdev,
 
 	return 0;
 
-unreg_region:
-	release_region(TCOBASE, 0x20);
-unreg_smi_en:
-	release_region(SMI_EN, 4);
-out_unmap:
+unreg_tco:
+	release_region(iTCO_wdt_private.tco_res->start,
+			resource_size(iTCO_wdt_private.tco_res));
+unreg_smi:
+	release_region(iTCO_wdt_private.smi_res->start,
+			resource_size(iTCO_wdt_private.smi_res));
+unmap_gcs:
 	if (iTCO_wdt_private.iTCO_version == 2)
 		iounmap(iTCO_wdt_private.gcs);
-out:
-	iTCO_wdt_private.ACPIBASE = 0;
-	return ret;
-}
-
-static void __devexit iTCO_wdt_cleanup(void)
-{
-	/* Stop the timer before we leave */
-	if (!nowayout)
-		iTCO_wdt_stop();
-
-	/* Deregister */
-	misc_deregister(&iTCO_wdt_miscdev);
-	release_region(TCOBASE, 0x20);
-	release_region(SMI_EN, 4);
+unreg_gcs:
 	if (iTCO_wdt_private.iTCO_version == 2)
-		iounmap(iTCO_wdt_private.gcs);
-	pci_dev_put(iTCO_wdt_private.pdev);
-	iTCO_wdt_private.ACPIBASE = 0;
-}
-
-static int __devinit iTCO_wdt_probe(struct platform_device *dev)
-{
-	int ret = -ENODEV;
-	int found = 0;
-	struct pci_dev *pdev = NULL;
-	const struct pci_device_id *ent;
-
-	spin_lock_init(&iTCO_wdt_private.io_lock);
-
-	for_each_pci_dev(pdev) {
-		ent = pci_match_id(iTCO_wdt_pci_tbl, pdev);
-		if (ent) {
-			found++;
-			ret = iTCO_wdt_init(pdev, ent, dev);
-			if (!ret)
-				break;
-		}
-	}
-
-	if (!found)
-		pr_info("No device detected\n");
+		release_mem_region(iTCO_wdt_private.gcs_res->start,
+				resource_size(iTCO_wdt_private.gcs_res));
+out:
+	iTCO_wdt_private.tco_res = NULL;
+	iTCO_wdt_private.smi_res = NULL;
+	iTCO_wdt_private.gcs_res = NULL;
+	iTCO_wdt_private.gcs = NULL;
 
 	return ret;
 }
 
 static int __devexit iTCO_wdt_remove(struct platform_device *dev)
 {
-	if (iTCO_wdt_private.ACPIBASE)
+	if (iTCO_wdt_private.tco_res || iTCO_wdt_private.smi_res)
 		iTCO_wdt_cleanup();
 
 	return 0;
@@ -977,23 +682,11 @@ static int __init iTCO_wdt_init_module(void)
 	if (err)
 		return err;
 
-	iTCO_wdt_platform_device = platform_device_register_simple(DRV_NAME,
-								-1, NULL, 0);
-	if (IS_ERR(iTCO_wdt_platform_device)) {
-		err = PTR_ERR(iTCO_wdt_platform_device);
-		goto unreg_platform_driver;
-	}
-
 	return 0;
-
-unreg_platform_driver:
-	platform_driver_unregister(&iTCO_wdt_driver);
-	return err;
 }
 
 static void __exit iTCO_wdt_cleanup_module(void)
 {
-	platform_device_unregister(iTCO_wdt_platform_device);
 	platform_driver_unregister(&iTCO_wdt_driver);
 	pr_info("Watchdog Module Unloaded\n");
 }
diff --git a/include/linux/mfd/lpc_ich.h b/include/linux/mfd/lpc_ich.h
index 91300b1..fec5256 100644
--- a/include/linux/mfd/lpc_ich.h
+++ b/include/linux/mfd/lpc_ich.h
@@ -20,6 +20,12 @@
 #ifndef LPC_ICH_H
 #define LPC_ICH_H
 
+/* Watchdog resources */
+#define ICH_RES_IO_TCO	0
+#define ICH_RES_IO_SMI	1
+#define ICH_RES_MEM_OFF	2
+#define ICH_RES_MEM_GCS	0
+
 /* GPIO resources */
 #define ICH_RES_GPIO	0
 #define ICH_RES_GPE0	1
@@ -35,6 +41,7 @@
 
 struct lpc_ich_info {
 	char name[32];
+	unsigned int iTCO_version;
 	unsigned int gpio_version;
 };
 
-- 
cgit v0.10.2


From 1379f49ea91a28f5c023d041aab785c3de60c65d Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Mon, 23 Apr 2012 14:28:44 +0200
Subject: mfd: Allow for const stmpe keyboard data

Since it's not like we will re-arrange the keys at run-time, it
seems proper to allow the keymap data to be const. This solves
a compilation warning in ux500.

Cc: Dmitry Torokhov <dtor@mail.ru>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/include/linux/mfd/stmpe.h b/include/linux/mfd/stmpe.h
index 8516fd1..f8d5b4d 100644
--- a/include/linux/mfd/stmpe.h
+++ b/include/linux/mfd/stmpe.h
@@ -117,7 +117,7 @@ struct matrix_keymap_data;
  * @no_autorepeat: disable key autorepeat
  */
 struct stmpe_keypad_platform_data {
-	struct matrix_keymap_data *keymap_data;
+	const struct matrix_keymap_data *keymap_data;
 	unsigned int debounce_ms;
 	unsigned int scan_count;
 	bool no_autorepeat;
-- 
cgit v0.10.2


From b683a0a675560307ebc458cf9044d98d27820b7c Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@gmail.com>
Date: Wed, 25 Apr 2012 09:30:36 +0800
Subject: mfd: Return proper error if tps65090 regmap_init_i2c fails

Return proper error instead of 0 if regmap_init_i2c fails.

Signed-off-by: Axel Lin <axel.lin@gmail.com>
Acked-by: Venu Byravarasu <vbyravarasu@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65090.c b/drivers/mfd/tps65090.c
index 6dc4c34..d994087 100644
--- a/drivers/mfd/tps65090.c
+++ b/drivers/mfd/tps65090.c
@@ -297,10 +297,10 @@ static int __devinit tps65090_i2c_probe(struct i2c_client *client,
 	tps65090->rmap = regmap_init_i2c(tps65090->client,
 		&tps65090_regmap_config);
 	if (IS_ERR(tps65090->rmap)) {
-		dev_err(&client->dev, "regmap_init failed with err: %ld\n",
-			PTR_ERR(tps65090->rmap));
+		ret = PTR_ERR(tps65090->rmap);
+		dev_err(&client->dev, "regmap_init failed with err: %d\n", ret);
 		goto err_irq_exit;
-	};
+	}
 
 	ret = mfd_add_devices(tps65090->dev, -1, tps65090s,
 		ARRAY_SIZE(tps65090s), NULL, 0);
-- 
cgit v0.10.2


From f8dddc0cfe9f56ed74fd5efde8d0754f5fb73a3f Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@gmail.com>
Date: Wed, 25 Apr 2012 10:01:55 +0800
Subject: mfd: Convert rc5t583 to devm_regmap_init_i2c()

Signed-off-by: Axel Lin <axel.lin@gmail.com>
Acked-by: Laxman Dewangan <ldewangan@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/rc5t583.c b/drivers/mfd/rc5t583.c
index 1cb4c35..cdc1df7 100644
--- a/drivers/mfd/rc5t583.c
+++ b/drivers/mfd/rc5t583.c
@@ -268,7 +268,7 @@ static int __devinit rc5t583_i2c_probe(struct i2c_client *i2c,
 	rc5t583->dev = &i2c->dev;
 	i2c_set_clientdata(i2c, rc5t583);
 
-	rc5t583->regmap = regmap_init_i2c(i2c, &rc5t583_regmap_config);
+	rc5t583->regmap = devm_regmap_init_i2c(i2c, &rc5t583_regmap_config);
 	if (IS_ERR(rc5t583->regmap)) {
 		ret = PTR_ERR(rc5t583->regmap);
 		dev_err(&i2c->dev, "regmap initialization failed: %d\n", ret);
@@ -277,7 +277,7 @@ static int __devinit rc5t583_i2c_probe(struct i2c_client *i2c,
 
 	ret = rc5t583_clear_ext_power_req(rc5t583, pdata);
 	if (ret < 0)
-		goto err_irq_init;
+		return ret;
 
 	if (i2c->irq) {
 		ret = rc5t583_irq_init(rc5t583, i2c->irq, pdata->irq_base);
@@ -300,8 +300,6 @@ static int __devinit rc5t583_i2c_probe(struct i2c_client *i2c,
 err_add_devs:
 	if (irq_init_success)
 		rc5t583_irq_exit(rc5t583);
-err_irq_init:
-	regmap_exit(rc5t583->regmap);
 	return ret;
 }
 
@@ -311,7 +309,6 @@ static int  __devexit rc5t583_i2c_remove(struct i2c_client *i2c)
 
 	mfd_remove_devices(rc5t583->dev);
 	rc5t583_irq_exit(rc5t583);
-	regmap_exit(rc5t583->regmap);
 	return 0;
 }
 
-- 
cgit v0.10.2


From 1092e1c761dd0dc92a431085e322956d91d057a6 Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@gmail.com>
Date: Wed, 25 Apr 2012 10:03:44 +0800
Subject: mfd: Convert s5m-core to devm_regmap_init_i2c()

Signed-off-by: Axel Lin <axel.lin@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/s5m-core.c b/drivers/mfd/s5m-core.c
index 48949d9..dd17030 100644
--- a/drivers/mfd/s5m-core.c
+++ b/drivers/mfd/s5m-core.c
@@ -114,12 +114,12 @@ static int s5m87xx_i2c_probe(struct i2c_client *i2c,
 		s5m87xx->wakeup = pdata->wakeup;
 	}
 
-	s5m87xx->regmap = regmap_init_i2c(i2c, &s5m_regmap_config);
+	s5m87xx->regmap = devm_regmap_init_i2c(i2c, &s5m_regmap_config);
 	if (IS_ERR(s5m87xx->regmap)) {
 		ret = PTR_ERR(s5m87xx->regmap);
 		dev_err(&i2c->dev, "Failed to allocate register map: %d\n",
 			ret);
-		goto err;
+		return ret;
 	}
 
 	s5m87xx->rtc = i2c_new_dummy(i2c->adapter, RTC_I2C_ADDR);
@@ -159,7 +159,6 @@ err:
 	mfd_remove_devices(s5m87xx->dev);
 	s5m_irq_exit(s5m87xx);
 	i2c_unregister_device(s5m87xx->rtc);
-	regmap_exit(s5m87xx->regmap);
 	return ret;
 }
 
@@ -170,7 +169,6 @@ static int s5m87xx_i2c_remove(struct i2c_client *i2c)
 	mfd_remove_devices(s5m87xx->dev);
 	s5m_irq_exit(s5m87xx);
 	i2c_unregister_device(s5m87xx->rtc);
-	regmap_exit(s5m87xx->regmap);
 	return 0;
 }
 
-- 
cgit v0.10.2


From 1d88f7a01d9588b3298cfd6a2ec30538e96d166e Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@gmail.com>
Date: Wed, 25 Apr 2012 10:04:58 +0800
Subject: mfd: Convert tps65090 to devm_regmap_init_i2c()

Signed-off-by: Axel Lin <axel.lin@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65090.c b/drivers/mfd/tps65090.c
index d994087..9ca4c44 100644
--- a/drivers/mfd/tps65090.c
+++ b/drivers/mfd/tps65090.c
@@ -294,8 +294,8 @@ static int __devinit tps65090_i2c_probe(struct i2c_client *client,
 		}
 	}
 
-	tps65090->rmap = regmap_init_i2c(tps65090->client,
-		&tps65090_regmap_config);
+	tps65090->rmap = devm_regmap_init_i2c(tps65090->client,
+					      &tps65090_regmap_config);
 	if (IS_ERR(tps65090->rmap)) {
 		ret = PTR_ERR(tps65090->rmap);
 		dev_err(&client->dev, "regmap_init failed with err: %d\n", ret);
@@ -307,14 +307,11 @@ static int __devinit tps65090_i2c_probe(struct i2c_client *client,
 	if (ret) {
 		dev_err(&client->dev, "add mfd devices failed with err: %d\n",
 			ret);
-		goto err_regmap_exit;
+		goto err_irq_exit;
 	}
 
 	return 0;
 
-err_regmap_exit:
-	regmap_exit(tps65090->rmap);
-
 err_irq_exit:
 	if (client->irq)
 		free_irq(client->irq, tps65090);
@@ -327,7 +324,6 @@ static int __devexit tps65090_i2c_remove(struct i2c_client *client)
 	struct tps65090 *tps65090 = i2c_get_clientdata(client);
 
 	mfd_remove_devices(tps65090->dev);
-	regmap_exit(tps65090->rmap);
 	if (client->irq)
 		free_irq(client->irq, tps65090);
 
-- 
cgit v0.10.2


From 0ef4619c74ef1e24e9ee340f95ee922f970cde54 Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@gmail.com>
Date: Wed, 25 Apr 2012 10:06:40 +0800
Subject: mfd: Convert tps65217 to devm_regmap_init_i2c()

Signed-off-by: Axel Lin <axel.lin@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65217.c b/drivers/mfd/tps65217.c
index f7d854e..c064c0a 100644
--- a/drivers/mfd/tps65217.c
+++ b/drivers/mfd/tps65217.c
@@ -150,7 +150,7 @@ static int __devinit tps65217_probe(struct i2c_client *client,
 		return -ENOMEM;
 
 	tps->pdata = pdata;
-	tps->regmap = regmap_init_i2c(client, &tps65217_regmap_config);
+	tps->regmap = devm_regmap_init_i2c(client, &tps65217_regmap_config);
 	if (IS_ERR(tps->regmap)) {
 		ret = PTR_ERR(tps->regmap);
 		dev_err(tps->dev, "Failed to allocate register map: %d\n",
@@ -163,9 +163,9 @@ static int __devinit tps65217_probe(struct i2c_client *client,
 
 	ret = tps65217_reg_read(tps, TPS65217_REG_CHIPID, &version);
 	if (ret < 0) {
-		dev_err(tps->dev, "Failed to read revision"
-					" register: %d\n", ret);
-		goto err_regmap;
+		dev_err(tps->dev, "Failed to read revision register: %d\n",
+			ret);
+		return ret;
 	}
 
 	dev_info(tps->dev, "TPS65217 ID %#x version 1.%d\n",
@@ -190,11 +190,6 @@ static int __devinit tps65217_probe(struct i2c_client *client,
 	}
 
 	return 0;
-
-err_regmap:
-	regmap_exit(tps->regmap);
-
-	return ret;
 }
 
 static int __devexit tps65217_remove(struct i2c_client *client)
@@ -205,8 +200,6 @@ static int __devexit tps65217_remove(struct i2c_client *client)
 	for (i = 0; i < TPS65217_NUM_REGULATOR; i++)
 		platform_device_unregister(tps->regulator_pdev[i]);
 
-	regmap_exit(tps->regmap);
-
 	return 0;
 }
 
-- 
cgit v0.10.2


From bbf6adc10cb2d5a1345cc66e61b3891d42a9afbe Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@gmail.com>
Date: Wed, 25 Apr 2012 10:09:46 +0800
Subject: mfd: Convert twl6040-core to devm_regmap_init_i2c()

Signed-off-by: Axel Lin <axel.lin@gmail.com>
Acked-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/twl6040-core.c b/drivers/mfd/twl6040-core.c
index 2d6beda..493f4a6 100644
--- a/drivers/mfd/twl6040-core.c
+++ b/drivers/mfd/twl6040-core.c
@@ -524,7 +524,7 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 		goto err;
 	}
 
-	twl6040->regmap = regmap_init_i2c(client, &twl6040_regmap_config);
+	twl6040->regmap = devm_regmap_init_i2c(client, &twl6040_regmap_config);
 	if (IS_ERR(twl6040->regmap)) {
 		ret = PTR_ERR(twl6040->regmap);
 		goto err;
@@ -623,7 +623,6 @@ gpio2_err:
 		gpio_free(twl6040->audpwron);
 gpio1_err:
 	i2c_set_clientdata(client, NULL);
-	regmap_exit(twl6040->regmap);
 err:
 	return ret;
 }
@@ -643,7 +642,6 @@ static int __devexit twl6040_remove(struct i2c_client *client)
 
 	mfd_remove_devices(&client->dev);
 	i2c_set_clientdata(client, NULL);
-	regmap_exit(twl6040->regmap);
 
 	return 0;
 }
-- 
cgit v0.10.2


From 5a2f1b5fae593dbdf4f3656ee5a5d111df3e9acb Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Apr 2012 13:05:24 +1000
Subject: mfd: enable wakeup on twl4030 IRQ.

Most of the interrupts that come through this line should trigger
wakeups:
  power button
  RTC alarm
  power available
  usb plug/unplug

so mark the interrupt as a wakeup interrupt.
This is particularly important for when the interrupt arrives during
the late suspend phase.  Without this setting it will be ignored.

Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c
index 5d656e8..ad733d7 100644
--- a/drivers/mfd/twl4030-irq.c
+++ b/drivers/mfd/twl4030-irq.c
@@ -757,6 +757,7 @@ int twl4030_init_irq(struct device *dev, int irq_num)
 		dev_err(dev, "could not claim irq%d: %d\n", irq_num, status);
 		goto fail_rqirq;
 	}
+	enable_irq_wake(irq_num);
 
 	return irq_base;
 fail_rqirq:
-- 
cgit v0.10.2


From 3aff4ebb95b20ad8db2c1447e8c52097d89af5a7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 May 2012 14:30:35 -0400
Subject: NFS: Prevent a deadlock in the new writeback code

We have to unlock the nfs_page before we call nfs_end_page_writeback
to avoid races with functions that expect the page to be unlocked
when PG_locked and PG_writeback are not set.
The problem is that nfs_unlock_request also releases the nfs_page,
causing a deadlock if the release of the nfs_open_context
triggers an iput() while the PG_writeback flag is still set...

The solution is to separate the unlocking and release of the nfs_page,
so that we can do the former before nfs_end_page_writeback and the
latter after.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 33a21ca..69146f3 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -128,10 +128,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 }
 
 /**
- * nfs_unlock_request - Unlock request and wake up sleepers.
+ * nfs_unlock_request_dont_release - Unlock request and wake up sleepers.
  * @req:
  */
-void nfs_unlock_request(struct nfs_page *req)
+void nfs_unlock_request_dont_release(struct nfs_page *req)
 {
 	if (!NFS_WBACK_BUSY(req)) {
 		printk(KERN_ERR "NFS: Invalid unlock attempted\n");
@@ -141,6 +141,14 @@ void nfs_unlock_request(struct nfs_page *req)
 	clear_bit(PG_BUSY, &req->wb_flags);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&req->wb_flags, PG_BUSY);
+}
+
+/**
+ * nfs_unlock_request - Unlock request and release the nfs_page
+ */
+void nfs_unlock_request(struct nfs_page *req)
+{
+	nfs_unlock_request_dont_release(req);
 	nfs_release_request(req);
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 6f263da..fd36b31 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -628,8 +628,9 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 remove_req:
 		nfs_inode_remove_request(req);
 next:
-		nfs_unlock_request(req);
+		nfs_unlock_request_dont_release(req);
 		nfs_end_page_writeback(page);
+		nfs_release_request(req);
 	}
 out:
 	hdr->release(hdr);
@@ -1042,8 +1043,9 @@ static void nfs_redirty_request(struct nfs_page *req)
 	struct page *page = req->wb_page;
 
 	nfs_mark_request_dirty(req);
-	nfs_unlock_request(req);
+	nfs_unlock_request_dont_release(req);
 	nfs_end_page_writeback(page);
+	nfs_release_request(req);
 }
 
 static void nfs_async_write_error(struct list_head *head)
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index f9ee9eb..ef75042 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -96,6 +96,7 @@ extern bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 				struct nfs_page *req);
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern	void nfs_unlock_request(struct nfs_page *req);
+extern void nfs_unlock_request_dont_release(struct nfs_page *req);
 
 /*
  * Lock the page of an asynchronous request without getting a new reference
-- 
cgit v0.10.2


From d1182b33ed9b62470cb6ab892a8a301542120086 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 May 2012 13:37:43 -0400
Subject: NFS: nfs_set_page_writeback no longer needs to reference the page

We now hold a reference to the nfs_page across the calls to
nfs_set_page_writeback and nfs_end_page_writeback, and that
means we already have a reference to the struct page.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index fd36b31..8382329 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -230,7 +230,6 @@ static int nfs_set_page_writeback(struct page *page)
 		struct inode *inode = page->mapping->host;
 		struct nfs_server *nfss = NFS_SERVER(inode);
 
-		page_cache_get(page);
 		if (atomic_long_inc_return(&nfss->writeback) >
 				NFS_CONGESTION_ON_THRESH) {
 			set_bdi_congested(&nfss->backing_dev_info,
@@ -246,7 +245,6 @@ static void nfs_end_page_writeback(struct page *page)
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
-	page_cache_release(page);
 	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
 		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
@@ -607,13 +605,12 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 	nfs_init_cinfo_from_inode(&cinfo, hdr->inode);
 	while (!list_empty(&hdr->pages)) {
 		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
-		struct page *page = req->wb_page;
 
 		bytes += req->wb_bytes;
 		nfs_list_remove_request(req);
 		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
 		    (hdr->good_bytes < bytes)) {
-			nfs_set_pageerror(page);
+			nfs_set_pageerror(req->wb_page);
 			nfs_context_set_write_error(req->wb_context, hdr->error);
 			goto remove_req;
 		}
@@ -629,7 +626,7 @@ remove_req:
 		nfs_inode_remove_request(req);
 next:
 		nfs_unlock_request_dont_release(req);
-		nfs_end_page_writeback(page);
+		nfs_end_page_writeback(req->wb_page);
 		nfs_release_request(req);
 	}
 out:
@@ -1040,11 +1037,9 @@ static int nfs_do_multiple_writes(struct list_head *head,
  */
 static void nfs_redirty_request(struct nfs_page *req)
 {
-	struct page *page = req->wb_page;
-
 	nfs_mark_request_dirty(req);
 	nfs_unlock_request_dont_release(req);
-	nfs_end_page_writeback(page);
+	nfs_end_page_writeback(req->wb_page);
 	nfs_release_request(req);
 }
 
-- 
cgit v0.10.2


From 7ad84aa9448571678c243f0c5ef383fbe5b50f4f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 May 2012 13:19:15 -0400
Subject: NFS: Clean up - simplify nfs_lock_request()

We only have two places where we need to grab a reference when trying
to lock the nfs_page. We're better off making that explicit.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 257d009..465ea84 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -657,6 +657,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
 				break;
 			}
 			nfs_lock_request(req);
+			kref_get(&req->wb_kref);
 			req->wb_index = pos >> PAGE_SHIFT;
 			req->wb_offset = pos & ~PAGE_MASK;
 			if (!nfs_pageio_add_request(desc, req)) {
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 8382329..553f7ef 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -260,10 +260,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblo
 		req = nfs_page_find_request_locked(page);
 		if (req == NULL)
 			break;
-		if (nfs_lock_request_dontget(req))
+		if (nfs_lock_request(req))
 			break;
 		/* Note: If we hold the page lock, as is the case in nfs_writepage,
-		 *	 then the call to nfs_lock_request_dontget() will always
+		 *	 then the call to nfs_lock_request() will always
 		 *	 succeed provided that someone hasn't already marked the
 		 *	 request as dirty (in which case we don't care).
 		 */
@@ -406,7 +406,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	/* Lock the request! */
-	nfs_lock_request_dontget(req);
+	nfs_lock_request(req);
 
 	spin_lock(&inode->i_lock);
 	if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
@@ -651,6 +651,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
 	list_for_each_entry_safe(req, tmp, src, wb_list) {
 		if (!nfs_lock_request(req))
 			continue;
+		kref_get(&req->wb_kref);
 		if (cond_resched_lock(cinfo->lock))
 			list_safe_reset_next(req, tmp, wb_list);
 		nfs_request_remove_commit_list(req, cinfo);
@@ -741,7 +742,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		    || end < req->wb_offset)
 			goto out_flushme;
 
-		if (nfs_lock_request_dontget(req))
+		if (nfs_lock_request(req))
 			break;
 
 		/* The request is locked, so wait and then retry */
@@ -1717,7 +1718,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 		req = nfs_page_find_request(page);
 		if (req == NULL)
 			break;
-		if (nfs_lock_request_dontget(req)) {
+		if (nfs_lock_request(req)) {
 			nfs_clear_request_commit(req);
 			nfs_inode_remove_request(req);
 			/*
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index ef75042..263f30a 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -99,24 +99,14 @@ extern	void nfs_unlock_request(struct nfs_page *req);
 extern void nfs_unlock_request_dont_release(struct nfs_page *req);
 
 /*
- * Lock the page of an asynchronous request without getting a new reference
+ * Lock the page of an asynchronous request
  */
 static inline int
-nfs_lock_request_dontget(struct nfs_page *req)
-{
-	return !test_and_set_bit(PG_BUSY, &req->wb_flags);
-}
-
-static inline int
 nfs_lock_request(struct nfs_page *req)
 {
-	if (test_and_set_bit(PG_BUSY, &req->wb_flags))
-		return 0;
-	kref_get(&req->wb_kref);
-	return 1;
+	return !test_and_set_bit(PG_BUSY, &req->wb_flags);
 }
 
-
 /**
  * nfs_list_add_request - Insert a request into a list
  * @req: request
-- 
cgit v0.10.2


From 1d1afcbc294cc7c788eb5c7b6b98e8d63caf002c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 May 2012 14:04:55 -0400
Subject: NFS: Clean up - Rename nfs_unlock_request and
 nfs_unlock_request_dont_release

Function rename to ensure that the functionality of nfs_unlock_request()
mirrors that of nfs_lock_request(). Then let nfs_unlock_and_release_request()
do the work of what used to be called nfs_unlock_request()...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 465ea84..845e201 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -488,7 +488,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 
 	while (!list_empty(&failed)) {
 		nfs_release_request(req);
-		nfs_unlock_request(req);
+		nfs_unlock_and_release_request(req);
 	}
 
 	if (put_dreq(dreq))
@@ -521,7 +521,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
 			nfs_mark_request_commit(req, NULL, &cinfo);
 		} else
 			nfs_release_request(req);
-		nfs_unlock_request(req);
+		nfs_unlock_and_release_request(req);
 	}
 
 	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
@@ -662,7 +662,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
 			req->wb_offset = pos & ~PAGE_MASK;
 			if (!nfs_pageio_add_request(desc, req)) {
 				result = desc->pg_error;
-				nfs_unlock_request(req);
+				nfs_unlock_and_release_request(req);
 				nfs_release_request(req);
 				break;
 			}
@@ -739,7 +739,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 		default:
 			nfs_release_request(req);
 		}
-		nfs_unlock_request(req);
+		nfs_unlock_and_release_request(req);
 	}
 
 out_put:
@@ -756,7 +756,7 @@ static void nfs_write_sync_pgio_error(struct list_head *head)
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_release_request(req);
-		nfs_unlock_request(req);
+		nfs_unlock_and_release_request(req);
 	}
 }
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 69146f3..aed913c 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -128,10 +128,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 }
 
 /**
- * nfs_unlock_request_dont_release - Unlock request and wake up sleepers.
+ * nfs_unlock_request - Unlock request and wake up sleepers.
  * @req:
  */
-void nfs_unlock_request_dont_release(struct nfs_page *req)
+void nfs_unlock_request(struct nfs_page *req)
 {
 	if (!NFS_WBACK_BUSY(req)) {
 		printk(KERN_ERR "NFS: Invalid unlock attempted\n");
@@ -144,11 +144,12 @@ void nfs_unlock_request_dont_release(struct nfs_page *req)
 }
 
 /**
- * nfs_unlock_request - Unlock request and release the nfs_page
+ * nfs_unlock_and_release_request - Unlock request and release the nfs_page
+ * @req:
  */
-void nfs_unlock_request(struct nfs_page *req)
+void nfs_unlock_and_release_request(struct nfs_page *req)
 {
-	nfs_unlock_request_dont_release(req);
+	nfs_unlock_request(req);
 	nfs_release_request(req);
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 553f7ef..8ffd7d5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -625,7 +625,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 remove_req:
 		nfs_inode_remove_request(req);
 next:
-		nfs_unlock_request_dont_release(req);
+		nfs_unlock_request(req);
 		nfs_end_page_writeback(req->wb_page);
 		nfs_release_request(req);
 	}
@@ -812,7 +812,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	nfs_grow_file(page, offset, count);
 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
 	nfs_mark_request_dirty(req);
-	nfs_unlock_request(req);
+	nfs_unlock_and_release_request(req);
 	return 0;
 }
 
@@ -1039,7 +1039,7 @@ static int nfs_do_multiple_writes(struct list_head *head,
 static void nfs_redirty_request(struct nfs_page *req)
 {
 	nfs_mark_request_dirty(req);
-	nfs_unlock_request_dont_release(req);
+	nfs_unlock_request(req);
 	nfs_end_page_writeback(req->wb_page);
 	nfs_release_request(req);
 }
@@ -1477,7 +1477,7 @@ void nfs_retry_commit(struct list_head *page_list,
 			dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
 				     BDI_RECLAIMABLE);
 		}
-		nfs_unlock_request(req);
+		nfs_unlock_and_release_request(req);
 	}
 }
 EXPORT_SYMBOL_GPL(nfs_retry_commit);
@@ -1555,7 +1555,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 		dprintk(" mismatch\n");
 		nfs_mark_request_dirty(req);
 	next:
-		nfs_unlock_request(req);
+		nfs_unlock_and_release_request(req);
 	}
 	nfs_init_cinfo(&cinfo, data->inode, data->dreq);
 	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
@@ -1726,7 +1726,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 			 * page as being dirty
 			 */
 			cancel_dirty_page(page, PAGE_CACHE_SIZE);
-			nfs_unlock_request(req);
+			nfs_unlock_and_release_request(req);
 			break;
 		}
 		ret = nfs_wait_on_request(req);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 263f30a..88d166b 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -96,7 +96,7 @@ extern bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 				struct nfs_page *req);
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern	void nfs_unlock_request(struct nfs_page *req);
-extern void nfs_unlock_request_dont_release(struct nfs_page *req);
+extern	void nfs_unlock_and_release_request(struct nfs_page *req);
 
 /*
  * Lock the page of an asynchronous request
-- 
cgit v0.10.2


From 0427708657750bdc03af3491a0297cab5e7efabf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 May 2012 13:54:53 -0400
Subject: NFS: Clean up - Simplify reference counting in fs/nfs/direct.c

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Fred Isaman <iisaman@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 845e201..c47a46e 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -486,10 +486,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 	}
 	nfs_pageio_complete(&desc);
 
-	while (!list_empty(&failed)) {
-		nfs_release_request(req);
+	while (!list_empty(&failed))
 		nfs_unlock_and_release_request(req);
-	}
 
 	if (put_dreq(dreq))
 		nfs_direct_write_complete(dreq, dreq->inode);
@@ -518,9 +516,9 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
 		nfs_list_remove_request(req);
 		if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
 			/* Note the rewrite will go through mds */
+			kref_get(&req->wb_kref);
 			nfs_mark_request_commit(req, NULL, &cinfo);
-		} else
-			nfs_release_request(req);
+		}
 		nfs_unlock_and_release_request(req);
 	}
 
@@ -657,13 +655,11 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
 				break;
 			}
 			nfs_lock_request(req);
-			kref_get(&req->wb_kref);
 			req->wb_index = pos >> PAGE_SHIFT;
 			req->wb_offset = pos & ~PAGE_MASK;
 			if (!nfs_pageio_add_request(desc, req)) {
 				result = desc->pg_error;
 				nfs_unlock_and_release_request(req);
-				nfs_release_request(req);
 				break;
 			}
 			pgbase = 0;
@@ -734,10 +730,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 		switch (bit) {
 		case NFS_IOHDR_NEED_RESCHED:
 		case NFS_IOHDR_NEED_COMMIT:
+			kref_get(&req->wb_kref);
 			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
-			break;
-		default:
-			nfs_release_request(req);
 		}
 		nfs_unlock_and_release_request(req);
 	}
@@ -755,7 +749,6 @@ static void nfs_write_sync_pgio_error(struct list_head *head)
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
-		nfs_release_request(req);
 		nfs_unlock_and_release_request(req);
 	}
 }
-- 
cgit v0.10.2


From 5af7df6b831ef9fd5fbde9d4bbd596f742cb2ad8 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Wed, 2 May 2012 16:54:42 +0300
Subject: mfd: Add regulator support for twl6040 VIO, V2V1 supplies

twl6040 has three power supply source:
VBAT needs to be connected to VBAT, VIO, and V2V1.
Add regulator support for the VIO, V2V1 supplies.
Initially handle the two supply together with bulk commands.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Reviewed-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Tero Kristo <t-kristo@ti.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/twl6040-core.c b/drivers/mfd/twl6040-core.c
index 493f4a6..7a92d95 100644
--- a/drivers/mfd/twl6040-core.c
+++ b/drivers/mfd/twl6040-core.c
@@ -27,6 +27,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
+#include <linux/err.h>
 #include <linux/platform_device.h>
 #include <linux/gpio.h>
 #include <linux/delay.h>
@@ -35,8 +36,10 @@
 #include <linux/err.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/twl6040.h>
+#include <linux/regulator/consumer.h>
 
 #define VIBRACTRL_MEMBER(reg) ((reg == TWL6040_REG_VIBCTLL) ? 0 : 1)
+#define TWL6040_NUM_SUPPLIES	(2)
 
 int twl6040_reg_read(struct twl6040 *twl6040, unsigned int reg)
 {
@@ -532,6 +535,21 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 
 	i2c_set_clientdata(client, twl6040);
 
+	twl6040->supplies[0].supply = "vio";
+	twl6040->supplies[1].supply = "v2v1";
+	ret = regulator_bulk_get(&client->dev, TWL6040_NUM_SUPPLIES,
+				 twl6040->supplies);
+	if (ret != 0) {
+		dev_err(&client->dev, "Failed to get supplies: %d\n", ret);
+		goto regulator_get_err;
+	}
+
+	ret = regulator_bulk_enable(TWL6040_NUM_SUPPLIES, twl6040->supplies);
+	if (ret != 0) {
+		dev_err(&client->dev, "Failed to enable supplies: %d\n", ret);
+		goto power_err;
+	}
+
 	twl6040->dev = &client->dev;
 	twl6040->irq = client->irq;
 	twl6040->irq_base = pdata->irq_base;
@@ -552,13 +570,13 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 		ret = gpio_request_one(twl6040->audpwron, GPIOF_OUT_INIT_LOW,
 				       "audpwron");
 		if (ret)
-			goto gpio1_err;
+			goto gpio_err;
 	}
 
 	/* codec interrupt */
 	ret = twl6040_irq_init(twl6040);
 	if (ret)
-		goto gpio2_err;
+		goto irq_init_err;
 
 	ret = request_threaded_irq(twl6040->irq_base + TWL6040_IRQ_READY,
 				   NULL, twl6040_naudint_handler, 0,
@@ -618,10 +636,14 @@ mfd_err:
 	free_irq(twl6040->irq_base + TWL6040_IRQ_READY, twl6040);
 irq_err:
 	twl6040_irq_exit(twl6040);
-gpio2_err:
+irq_init_err:
 	if (gpio_is_valid(twl6040->audpwron))
 		gpio_free(twl6040->audpwron);
-gpio1_err:
+gpio_err:
+	regulator_bulk_disable(TWL6040_NUM_SUPPLIES, twl6040->supplies);
+power_err:
+	regulator_bulk_free(TWL6040_NUM_SUPPLIES, twl6040->supplies);
+regulator_get_err:
 	i2c_set_clientdata(client, NULL);
 err:
 	return ret;
@@ -643,6 +665,9 @@ static int __devexit twl6040_remove(struct i2c_client *client)
 	mfd_remove_devices(&client->dev);
 	i2c_set_clientdata(client, NULL);
 
+	regulator_bulk_disable(TWL6040_NUM_SUPPLIES, twl6040->supplies);
+	regulator_bulk_free(TWL6040_NUM_SUPPLIES, twl6040->supplies);
+
 	return 0;
 }
 
diff --git a/include/linux/mfd/twl6040.h b/include/linux/mfd/twl6040.h
index b15b5f0..6659487 100644
--- a/include/linux/mfd/twl6040.h
+++ b/include/linux/mfd/twl6040.h
@@ -27,6 +27,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/mfd/core.h>
+#include <linux/regulator/consumer.h>
 
 #define TWL6040_REG_ASICID		0x01
 #define TWL6040_REG_ASICREV		0x02
@@ -203,6 +204,7 @@ struct regmap;
 struct twl6040 {
 	struct device *dev;
 	struct regmap *regmap;
+	struct regulator_bulk_data supplies[2]; /* supplies for vio, v2v1 */
 	struct mutex mutex;
 	struct mutex io_mutex;
 	struct mutex irq_mutex;
-- 
cgit v0.10.2


From 7ccfe9b1d58ef5cf8fdbd50b6ee2ae0e9aa9cb36 Mon Sep 17 00:00:00 2001
From: Michel JAOUEN <michel.jaouen@stericsson.com>
Date: Mon, 7 May 2012 15:02:03 +0200
Subject: mfd: Support of hierachical interrupt for ab8500

Hierarchical interrupt is supported since ab8500 V2.
However, it is not implemented in the ab8500-core driver.
With the current implementation, when an ab9540 interrupt
occurs, 17 Latch registers are read through i2c. With
hierarchical interrupt implementation, there are only 4 i2c
accesses.

Signed-off-by: Maxime Coquelin <maxime.coquelin@stericsson.com>
Reviewed-by: Michel Jaouen <michel.jaouen@stericsson.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index 08850f0..6a59919 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -91,6 +91,15 @@
 #define AB8500_IT_MASK23_REG		0x56
 #define AB8500_IT_MASK24_REG		0x57
 
+/*
+ * latch hierarchy registers
+ */
+#define AB8500_IT_LATCHHIER1_REG	0x60
+#define AB8500_IT_LATCHHIER2_REG	0x61
+#define AB8500_IT_LATCHHIER3_REG	0x62
+
+#define AB8500_IT_LATCHHIER_NUM		3
+
 #define AB8500_REV_REG			0x80
 #define AB8500_IC_NAME_REG		0x82
 #define AB8500_SWITCH_OFF_STATUS	0x00
@@ -340,6 +349,90 @@ static struct irq_chip ab8500_irq_chip = {
 	.irq_unmask		= ab8500_irq_unmask,
 };
 
+static int ab8500_handle_hierarchical_line(struct ab8500 *ab8500,
+					int latch_offset, u8 latch_val)
+{
+	int int_bit = __ffs(latch_val);
+	int line, i;
+
+	do {
+		int_bit = __ffs(latch_val);
+
+		for (i = 0; i < ab8500->mask_size; i++)
+			if (ab8500->irq_reg_offset[i] == latch_offset)
+				break;
+
+		if (i >= ab8500->mask_size) {
+			dev_err(ab8500->dev, "Register offset 0x%2x not declared\n",
+					latch_offset);
+			return -ENXIO;
+		}
+
+		line = (i << 3) + int_bit;
+		latch_val &= ~(1 << int_bit);
+
+		handle_nested_irq(ab8500->irq_base + line);
+	} while (latch_val);
+
+	return 0;
+}
+
+static int ab8500_handle_hierarchical_latch(struct ab8500 *ab8500,
+					int hier_offset, u8 hier_val)
+{
+	int latch_bit, status;
+	u8 latch_offset, latch_val;
+
+	do {
+		latch_bit = __ffs(hier_val);
+		latch_offset = (hier_offset << 3) + latch_bit;
+
+		/* Fix inconsistent ITFromLatch25 bit mapping... */
+		if (unlikely(latch_offset == 17))
+			latch_offset = 24;
+
+		status = get_register_interruptible(ab8500,
+				AB8500_INTERRUPT,
+				AB8500_IT_LATCH1_REG + latch_offset,
+				&latch_val);
+		if (status < 0 || latch_val == 0)
+			goto discard;
+
+		status = ab8500_handle_hierarchical_line(ab8500,
+				latch_offset, latch_val);
+		if (status < 0)
+			return status;
+discard:
+		hier_val &= ~(1 << latch_bit);
+	} while (hier_val);
+
+	return 0;
+}
+
+static irqreturn_t ab8500_hierarchical_irq(int irq, void *dev)
+{
+	struct ab8500 *ab8500 = dev;
+	u8 i;
+
+	dev_vdbg(ab8500->dev, "interrupt\n");
+
+	/*  Hierarchical interrupt version */
+	for (i = 0; i < AB8500_IT_LATCHHIER_NUM; i++) {
+		int status;
+		u8 hier_val;
+
+		status = get_register_interruptible(ab8500, AB8500_INTERRUPT,
+			AB8500_IT_LATCHHIER1_REG + i, &hier_val);
+		if (status < 0 || hier_val == 0)
+			continue;
+
+		status = ab8500_handle_hierarchical_latch(ab8500, i, hier_val);
+		if (status < 0)
+			break;
+	}
+	return IRQ_HANDLED;
+}
+
 static irqreturn_t ab8500_irq(int irq, void *dev)
 {
 	struct ab8500 *ab8500 = dev;
@@ -1179,9 +1272,18 @@ int __devinit ab8500_init(struct ab8500 *ab8500, enum ab8500_version version)
 		if (ret)
 			goto out_freeoldmask;
 
-		ret = request_threaded_irq(ab8500->irq, NULL, ab8500_irq,
-					   IRQF_ONESHOT | IRQF_NO_SUSPEND,
-					   "ab8500", ab8500);
+		/*  Activate this feature only in ab9540 */
+		/*  till tests are done on ab8500 1p2 or later*/
+		if (is_ab9540(ab8500))
+			ret = request_threaded_irq(ab8500->irq, NULL,
+					ab8500_hierarchical_irq,
+					IRQF_ONESHOT | IRQF_NO_SUSPEND,
+					"ab8500", ab8500);
+		else
+			ret = request_threaded_irq(ab8500->irq, NULL,
+					ab8500_irq,
+					IRQF_ONESHOT | IRQF_NO_SUSPEND,
+					"ab8500", ab8500);
 		if (ret)
 			goto out_removeirq;
 	}
-- 
cgit v0.10.2


From 3028eb2b324c517da1e9e589743c4a5154f70dd1 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:30 -0400
Subject: NFS: Rename nfs4_proc_get_root()

This function is really getting the root filehandle and not the root
dentry of the filesystem.  I also removed the rpc_ops lookup from
nfs4_get_rootfh() under the assumption that if we reach this function
then we already know we are using NFS v4.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 4ca6f5c..8a0f33e 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -150,7 +150,7 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
 		goto out;
 
 	/* Start by getting the root filehandle from the server */
-	ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
+	ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo);
 	if (ret < 0) {
 		dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
 		goto out;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 97365b0..edeef71 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -214,6 +214,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
 /* nfs4proc.c */
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
+extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 98eb48d..69212d2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2345,8 +2345,8 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 /*
  * get the file handle for the "/" directory on the server
  */
-static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
-			      struct nfs_fsinfo *info)
+int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
+			 struct nfs_fsinfo *info)
 {
 	int minor_version = server->nfs_client->cl_minorversion;
 	int status = nfs4_lookup_root(server, fhandle, info);
@@ -6539,7 +6539,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.dir_inode_ops	= &nfs4_dir_inode_operations,
 	.file_inode_ops	= &nfs4_file_inode_operations,
 	.file_ops	= &nfs4_file_operations,
-	.getroot	= nfs4_proc_get_root,
+	.getroot	= nfs4_proc_get_rootfh,
 	.submount	= nfs4_submount,
 	.getattr	= nfs4_proc_getattr,
 	.setattr	= nfs4_proc_setattr,
-- 
cgit v0.10.2


From bae36241be7fab16b2e987d31b6e6bd4456ac188 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:31 -0400
Subject: NFS: Create a single nfs_get_root()

This patch splits out the NFS v4 specific functionality of
nfs4_get_root() into its own rpc_op called by the generic client, and
leaves nfs4_proc_get_rootfh() as its own stand alone function.  This
also allows me to change nfs4_remote_mount(), nfs4_xdev_mount() and
nfs4_remote_referral_mount() to use the generic client's nfs_get_root()
function.  Later patches in this series will collapse these functions
into one common function, so using the same get_root() function
everywhere simplifies future changes.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 8a0f33e..8abfb19 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -178,87 +178,4 @@ out:
 	return ret;
 }
 
-/*
- * get an NFS4 root dentry from the root filehandle
- */
-struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh,
-			     const char *devname)
-{
-	struct nfs_server *server = NFS_SB(sb);
-	struct nfs_fattr *fattr = NULL;
-	struct dentry *ret;
-	struct inode *inode;
-	void *name = kstrdup(devname, GFP_KERNEL);
-	int error;
-
-	dprintk("--> nfs4_get_root()\n");
-
-	if (!name)
-		return ERR_PTR(-ENOMEM);
-
-	/* get the info about the server and filesystem */
-	error = nfs4_server_capabilities(server, mntfh);
-	if (error < 0) {
-		dprintk("nfs_get_root: getcaps error = %d\n",
-			-error);
-		kfree(name);
-		return ERR_PTR(error);
-	}
-
-	fattr = nfs_alloc_fattr();
-	if (fattr == NULL) {
-		kfree(name);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	/* get the actual root for this mount */
-	error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
-	if (error < 0) {
-		dprintk("nfs_get_root: getattr error = %d\n", -error);
-		ret = ERR_PTR(error);
-		goto out;
-	}
-
-	if (fattr->valid & NFS_ATTR_FATTR_FSID &&
-	    !nfs_fsid_equal(&server->fsid, &fattr->fsid))
-		memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
-
-	inode = nfs_fhget(sb, mntfh, fattr);
-	if (IS_ERR(inode)) {
-		dprintk("nfs_get_root: get root inode failed\n");
-		ret = ERR_CAST(inode);
-		goto out;
-	}
-
-	error = nfs_superblock_set_dummy_root(sb, inode);
-	if (error != 0) {
-		ret = ERR_PTR(error);
-		goto out;
-	}
-
-	/* root dentries normally start off anonymous and get spliced in later
-	 * if the dentry tree reaches them; however if the dentry already
-	 * exists, we'll pick it up at this point and use it as the root
-	 */
-	ret = d_obtain_alias(inode);
-	if (IS_ERR(ret)) {
-		dprintk("nfs_get_root: get root dentry failed\n");
-		goto out;
-	}
-
-	security_d_instantiate(ret, inode);
-	spin_lock(&ret->d_lock);
-	if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
-		ret->d_fsdata = name;
-		name = NULL;
-	}
-	spin_unlock(&ret->d_lock);
-out:
-	if (name)
-		kfree(name);
-	nfs_free_fattr(fattr);
-	dprintk("<-- nfs4_get_root()\n");
-	return ret;
-}
-
 #endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 69212d2..e6ab15f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -80,6 +80,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
 static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
+static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			    struct nfs_fattr *fattr, struct iattr *sattr,
@@ -2363,6 +2364,31 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
 	return nfs4_map_errors(status);
 }
 
+static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
+			      struct nfs_fsinfo *info)
+{
+	int error;
+	struct nfs_fattr *fattr = info->fattr;
+
+	error = nfs4_server_capabilities(server, mntfh);
+	if (error < 0) {
+		dprintk("nfs4_get_root: getcaps error = %d\n", -error);
+		return error;
+	}
+
+	error = nfs4_proc_getattr(server, mntfh, fattr);
+	if (error < 0) {
+		dprintk("nfs4_get_root: getattr error = %d\n", -error);
+		return error;
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_FSID &&
+	    !nfs_fsid_equal(&server->fsid, &fattr->fsid))
+		memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+
+	return error;
+}
+
 /*
  * Get locations and (maybe) other attributes of a referral.
  * Note that we'll actually follow the referral later when
@@ -6539,7 +6565,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.dir_inode_ops	= &nfs4_dir_inode_operations,
 	.file_inode_ops	= &nfs4_file_inode_operations,
 	.file_ops	= &nfs4_file_operations,
-	.getroot	= nfs4_proc_get_rootfh,
+	.getroot	= nfs4_proc_get_root,
 	.submount	= nfs4_submount,
 	.getattr	= nfs4_proc_getattr,
 	.setattr	= nfs4_proc_setattr,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 4ac7fca..75b1717 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2727,7 +2727,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
 		nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL);
 	}
 
-	mntroot = nfs4_get_root(s, mntfh, dev_name);
+	mntroot = nfs_get_root(s, mntfh, dev_name);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
@@ -2991,7 +2991,7 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
 		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
-	mntroot = nfs4_get_root(s, data->fh, dev_name);
+	mntroot = nfs_get_root(s, data->fh, dev_name);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
@@ -3082,7 +3082,7 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
 		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
-	mntroot = nfs4_get_root(s, mntfh, dev_name);
+	mntroot = nfs_get_root(s, mntfh, dev_name);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
-- 
cgit v0.10.2


From 2311b9439ce8c525f3f8f821fc2ca9a541f673a5 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:32 -0400
Subject: NFS: Don't pass mount data to nfs_fscache_get_super_cookie()

I intend on creating a single nfs_fs_mount() function used by all our
mount paths.  To avoid checking between new mounts and clone mounts, I
instead pass both structures to a new function in super.c that finds the
cache key and then looks up the super cookie.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index ae65c16..c817787 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -64,23 +64,12 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp)
  * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
  * superblock across an automount point of some nature.
  */
-void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq,
-				  struct nfs_clone_mount *mntdata)
+void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen)
 {
 	struct nfs_fscache_key *key, *xkey;
 	struct nfs_server *nfss = NFS_SB(sb);
 	struct rb_node **p, *parent;
-	int diff, ulen;
-
-	if (uniq) {
-		ulen = strlen(uniq);
-	} else if (mntdata) {
-		struct nfs_server *mnt_s = NFS_SB(mntdata->sb);
-		if (mnt_s->fscache_key) {
-			uniq = mnt_s->fscache_key->key.uniquifier;
-			ulen = mnt_s->fscache_key->key.uniq_len;
-		}
-	}
+	int diff;
 
 	if (!uniq) {
 		uniq = "";
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index b9c572d..2a08b91 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -73,9 +73,7 @@ extern void nfs_fscache_unregister(void);
 extern void nfs_fscache_get_client_cookie(struct nfs_client *);
 extern void nfs_fscache_release_client_cookie(struct nfs_client *);
 
-extern void nfs_fscache_get_super_cookie(struct super_block *,
-					 const char *,
-					 struct nfs_clone_mount *);
+extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int);
 extern void nfs_fscache_release_super_cookie(struct super_block *);
 
 extern void nfs_fscache_init_inode_cookie(struct inode *);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 75b1717..f56fb35 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2278,6 +2278,27 @@ static int nfs_compare_super(struct super_block *sb, void *data)
 	return nfs_compare_mount_options(sb, server, mntflags);
 }
 
+static void nfs_get_cache_cookie(struct super_block *sb,
+				 struct nfs_parsed_mount_data *parsed,
+				 struct nfs_clone_mount *cloned)
+{
+	char *uniq = NULL;
+	int ulen = 0;
+
+	if (parsed && parsed->fscache_uniq) {
+		uniq = parsed->fscache_uniq;
+		ulen = strlen(parsed->fscache_uniq);
+	} else if (cloned) {
+		struct nfs_server *mnt_s = NFS_SB(cloned->sb);
+		if (mnt_s->fscache_key) {
+			uniq = mnt_s->fscache_key->key.uniquifier;
+			ulen = mnt_s->fscache_key->key.uniq_len;
+		};
+	}
+
+	nfs_fscache_get_super_cookie(sb, uniq, ulen);
+}
+
 static int nfs_bdi_register(struct nfs_server *server)
 {
 	return bdi_register_dev(&server->backing_dev_info, server->s_dev);
@@ -2352,7 +2373,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs_fill_super(s, data);
-		nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL);
+		nfs_get_cache_cookie(s, data, NULL);
 	}
 
 	mntroot = nfs_get_root(s, mntfh, dev_name);
@@ -2461,7 +2482,7 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs_clone_super(s, data->sb);
-		nfs_fscache_get_super_cookie(s, NULL, data);
+		nfs_get_cache_cookie(s, NULL, data);
 	}
 
 	mntroot = nfs_get_root(s, data->fh, dev_name);
@@ -2724,7 +2745,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs4_fill_super(s);
-		nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL);
+		nfs_get_cache_cookie(s, data, NULL);
 	}
 
 	mntroot = nfs_get_root(s, mntfh, dev_name);
@@ -2988,7 +3009,7 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs4_clone_super(s, data->sb);
-		nfs_fscache_get_super_cookie(s, NULL, data);
+		nfs_get_cache_cookie(s, NULL, data);
 	}
 
 	mntroot = nfs_get_root(s, data->fh, dev_name);
@@ -3079,7 +3100,7 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs4_fill_super(s);
-		nfs_fscache_get_super_cookie(s, NULL, data);
+		nfs_get_cache_cookie(s, NULL, data);
 	}
 
 	mntroot = nfs_get_root(s, mntfh, dev_name);
-- 
cgit v0.10.2


From 586f95cd4ffda7aa120327ec09865b181c809cdf Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:33 -0400
Subject: NFS: Remove NFS4_MOUNT_UNSHARED

This flag is numerically equivalent to NFS_MOUNT_UNSHARED, so I can
remove it to make collapsing functions more straightforward.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f56fb35..40d43e0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2719,7 +2719,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
 	}
 	sb_mntdata.server = server;
 
-	if (server->flags & NFS4_MOUNT_UNSHARED)
+	if (server->flags & NFS_MOUNT_UNSHARED)
 		compare_super = NULL;
 
 	/* -o noac implies -o sync */
@@ -2983,7 +2983,7 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
 	}
 	sb_mntdata.server = server;
 
-	if (server->flags & NFS4_MOUNT_UNSHARED)
+	if (server->flags & NFS_MOUNT_UNSHARED)
 		compare_super = NULL;
 
 	/* -o noac implies -o sync */
@@ -3074,7 +3074,7 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
 	}
 	sb_mntdata.server = server;
 
-	if (server->flags & NFS4_MOUNT_UNSHARED)
+	if (server->flags & NFS_MOUNT_UNSHARED)
 		compare_super = NULL;
 
 	/* -o noac implies -o sync */
-- 
cgit v0.10.2


From c40f8d1d35a27d81b4af9d5d2f7286fd978ae9b2 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:34 -0400
Subject: NFS: Create a common fs_mount() function

The nfs4_remote_mount() function was only slightly different from the
nfs_fs_mount() function used by the generic client.  I created a new
nfs_mount_info structure to set different parameters to help combine
these functions.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 40d43e0..64a62da 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -277,6 +277,11 @@ static match_table_t nfs_vers_tokens = {
 	{ Opt_vers_err, NULL }
 };
 
+struct nfs_mount_info {
+	void (*fill_super)(struct super_block *, struct nfs_mount_info *);
+	struct nfs_parsed_mount_data *parsed;
+};
+
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct dentry *);
@@ -2129,8 +2134,9 @@ static inline void nfs_initialise_sb(struct super_block *sb)
  * Finish setting up an NFS2/3 superblock
  */
 static void nfs_fill_super(struct super_block *sb,
-			   struct nfs_parsed_mount_data *data)
+			   struct nfs_mount_info *mount_info)
 {
+	struct nfs_parsed_mount_data *data = mount_info->parsed;
 	struct nfs_server *server = NFS_SB(sb);
 
 	sb->s_blocksize_bits = 0;
@@ -2304,47 +2310,21 @@ static int nfs_bdi_register(struct nfs_server *server)
 	return bdi_register_dev(&server->backing_dev_info, server->s_dev);
 }
 
-static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
+static struct dentry *nfs_fs_mount_common(struct file_system_type *fs_type,
+					  struct nfs_server *server,
+					  int flags, const char *dev_name,
+					  struct nfs_fh *mntfh,
+					  struct nfs_mount_info *mount_info)
 {
-	struct nfs_server *server = NULL;
 	struct super_block *s;
-	struct nfs_parsed_mount_data *data;
-	struct nfs_fh *mntfh;
 	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	struct nfs_sb_mountdata sb_mntdata = {
 		.mntflags = flags,
+		.server = server,
 	};
 	int error;
 
-	data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
-	mntfh = nfs_alloc_fhandle();
-	if (data == NULL || mntfh == NULL)
-		goto out;
-
-	/* Validate the mount data */
-	error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
-	if (error < 0) {
-		mntroot = ERR_PTR(error);
-		goto out;
-	}
-
-#ifdef CONFIG_NFS_V4
-	if (data->version == 4) {
-		mntroot = nfs4_try_mount(flags, dev_name, data);
-		goto out;
-	}
-#endif	/* CONFIG_NFS_V4 */
-
-	/* Get a volume representation */
-	server = nfs_create_server(data, mntfh);
-	if (IS_ERR(server)) {
-		mntroot = ERR_CAST(server);
-		goto out;
-	}
-	sb_mntdata.server = server;
-
 	if (server->flags & NFS_MOUNT_UNSHARED)
 		compare_super = NULL;
 
@@ -2372,23 +2352,21 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 
 	if (!s->s_root) {
 		/* initial superblock/root creation */
-		nfs_fill_super(s, data);
-		nfs_get_cache_cookie(s, data, NULL);
+		mount_info->fill_super(s, mount_info);
+		nfs_get_cache_cookie(s, mount_info->parsed, NULL);
 	}
 
 	mntroot = nfs_get_root(s, mntfh, dev_name);
 	if (IS_ERR(mntroot))
 		goto error_splat_super;
 
-	error = security_sb_set_mnt_opts(s, &data->lsm_opts);
+	error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts);
 	if (error)
 		goto error_splat_root;
 
 	s->s_flags |= MS_ACTIVE;
 
 out:
-	nfs_free_parsed_mount_data(data);
-	nfs_free_fhandle(mntfh);
 	return mntroot;
 
 out_err_nosb:
@@ -2406,6 +2384,52 @@ error_splat_bdi:
 	goto out;
 }
 
+static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_server *server;
+	struct nfs_parsed_mount_data *data = NULL;
+	struct nfs_mount_info mount_info = {
+		.fill_super = nfs_fill_super,
+	};
+	struct nfs_fh *mntfh;
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
+	int error;
+
+	data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
+	mntfh = nfs_alloc_fhandle();
+	if (data == NULL || mntfh == NULL)
+		goto out;
+
+	/* Validate the mount data */
+	error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
+	if (error < 0) {
+		mntroot = ERR_PTR(error);
+		goto out;
+	}
+	mount_info.parsed = data;
+
+#ifdef CONFIG_NFS_V4
+	if (data->version == 4) {
+		mntroot = nfs4_try_mount(flags, dev_name, data);
+		goto out;
+	}
+#endif	/* CONFIG_NFS_V4 */
+
+	/* Get a volume representation */
+	server = nfs_create_server(data, mntfh);
+	if (IS_ERR(server)) {
+		mntroot = ERR_CAST(server);
+		goto out;
+	}
+
+	mntroot = nfs_fs_mount_common(fs_type, server, flags, dev_name, mntfh, &mount_info);
+out:
+	nfs_free_parsed_mount_data(data);
+	nfs_free_fhandle(mntfh);
+	return mntroot;
+}
+
 /*
  * Ensure that we unregister the bdi before kill_anon_super
  * releases the device name
@@ -2544,7 +2568,8 @@ static void nfs4_clone_super(struct super_block *sb,
 /*
  * Set up an NFS4 superblock
  */
-static void nfs4_fill_super(struct super_block *sb)
+static void nfs4_fill_super(struct super_block *sb,
+			    struct nfs_mount_info *mount_info)
 {
 	sb->s_time_gran = 1;
 	sb->s_op = &nfs4_sops;
@@ -2696,89 +2721,31 @@ static struct dentry *
 nfs4_remote_mount(struct file_system_type *fs_type, int flags,
 		  const char *dev_name, void *raw_data)
 {
-	struct nfs_parsed_mount_data *data = raw_data;
-	struct super_block *s;
+	struct nfs_mount_info mount_info = {
+		.fill_super = nfs4_fill_super,
+		.parsed = raw_data,
+	};
 	struct nfs_server *server;
 	struct nfs_fh *mntfh;
-	struct dentry *mntroot;
-	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
-	struct nfs_sb_mountdata sb_mntdata = {
-		.mntflags = flags,
-	};
-	int error = -ENOMEM;
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 
 	mntfh = nfs_alloc_fhandle();
-	if (data == NULL || mntfh == NULL)
+	if (mount_info.parsed == NULL || mntfh == NULL)
 		goto out;
 
 	/* Get a volume representation */
-	server = nfs4_create_server(data, mntfh);
+	server = nfs4_create_server(mount_info.parsed, mntfh);
 	if (IS_ERR(server)) {
-		error = PTR_ERR(server);
+		mntroot = ERR_CAST(server);
 		goto out;
 	}
-	sb_mntdata.server = server;
 
-	if (server->flags & NFS_MOUNT_UNSHARED)
-		compare_super = NULL;
-
-	/* -o noac implies -o sync */
-	if (server->flags & NFS_MOUNT_NOAC)
-		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
-
-	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
-	if (IS_ERR(s)) {
-		error = PTR_ERR(s);
-		goto out_free;
-	}
-
-	if (s->s_fs_info != server) {
-		nfs_free_server(server);
-		server = NULL;
-	} else {
-		error = nfs_bdi_register(server);
-		if (error)
-			goto error_splat_bdi;
-	}
-
-	if (!s->s_root) {
-		/* initial superblock/root creation */
-		nfs4_fill_super(s);
-		nfs_get_cache_cookie(s, data, NULL);
-	}
-
-	mntroot = nfs_get_root(s, mntfh, dev_name);
-	if (IS_ERR(mntroot)) {
-		error = PTR_ERR(mntroot);
-		goto error_splat_super;
-	}
-
-	error = security_sb_set_mnt_opts(s, &data->lsm_opts);
-	if (error)
-		goto error_splat_root;
-
-	s->s_flags |= MS_ACTIVE;
-
-	nfs_free_fhandle(mntfh);
-	return mntroot;
+	mntroot = nfs_fs_mount_common(fs_type, server, flags,
+				      dev_name, mntfh, &mount_info);
 
 out:
 	nfs_free_fhandle(mntfh);
-	return ERR_PTR(error);
-
-out_free:
-	nfs_free_server(server);
-	goto out;
-
-error_splat_root:
-	dput(mntroot);
-error_splat_super:
-	if (server && !s->s_root)
-		bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
-	deactivate_locked_super(s);
-	goto out;
+	return mntroot;
 }
 
 static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
@@ -3099,7 +3066,7 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
 
 	if (!s->s_root) {
 		/* initial superblock/root creation */
-		nfs4_fill_super(s);
+		nfs4_fill_super(s, NULL);
 		nfs_get_cache_cookie(s, NULL, data);
 	}
 
-- 
cgit v0.10.2


From 8c958e0c4c52d600bd2ea677eb920fceda8aee49 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:35 -0400
Subject: NFS: Create a common xdev_mount() function

The only difference between nfs_xdev_mount() and nfs4_xdev_mount() is the
clone_super() function called to clone the super block.  I can combine
these two functions by using the fill_super field in the mount_info
structure.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 64a62da..707d1f6 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -280,6 +280,7 @@ static match_table_t nfs_vers_tokens = {
 struct nfs_mount_info {
 	void (*fill_super)(struct super_block *, struct nfs_mount_info *);
 	struct nfs_parsed_mount_data *parsed;
+	struct nfs_clone_mount *cloned;
 };
 
 static void nfs_umount_begin(struct super_block *);
@@ -2160,8 +2161,9 @@ static void nfs_fill_super(struct super_block *sb,
  * Finish setting up a cloned NFS2/3 superblock
  */
 static void nfs_clone_super(struct super_block *sb,
-			    const struct super_block *old_sb)
+			    struct nfs_mount_info *mount_info)
 {
+	const struct super_block *old_sb = mount_info->cloned->sb;
 	struct nfs_server *server = NFS_SB(sb);
 
 	sb->s_blocksize_bits = old_sb->s_blocksize_bits;
@@ -2454,13 +2456,13 @@ static void nfs_kill_super(struct super_block *s)
 }
 
 /*
- * Clone an NFS2/3 server record on xdev traversal (FSID-change)
+ * Clone an NFS2/3/4 server record on xdev traversal (FSID-change)
  */
 static struct dentry *
-nfs_xdev_mount(struct file_system_type *fs_type, int flags,
-		const char *dev_name, void *raw_data)
+nfs_xdev_mount_common(struct file_system_type *fs_type, int flags,
+		const char *dev_name, struct nfs_mount_info *mount_info)
 {
-	struct nfs_clone_mount *data = raw_data;
+	struct nfs_clone_mount *data = mount_info->cloned;
 	struct super_block *s;
 	struct nfs_server *server;
 	struct dentry *mntroot;
@@ -2470,7 +2472,7 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
 	};
 	int error;
 
-	dprintk("--> nfs_xdev_mount()\n");
+	dprintk("--> nfs_xdev_mount_common()\n");
 
 	/* create a new volume representation */
 	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor);
@@ -2505,7 +2507,7 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
 
 	if (!s->s_root) {
 		/* initial superblock/root creation */
-		nfs_clone_super(s, data->sb);
+		mount_info->fill_super(s, mount_info);
 		nfs_get_cache_cookie(s, NULL, data);
 	}
 
@@ -2525,13 +2527,13 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
 	/* clone any lsm security options from the parent to the new sb */
 	security_sb_clone_mnt_opts(data->sb, s);
 
-	dprintk("<-- nfs_xdev_mount() = 0\n");
+	dprintk("<-- nfs_xdev_mount_common() = 0\n");
 	return mntroot;
 
 out_err_nosb:
 	nfs_free_server(server);
 out_err_noserver:
-	dprintk("<-- nfs_xdev_mount() = %d [error]\n", error);
+	dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error);
 	return ERR_PTR(error);
 
 error_splat_super:
@@ -2539,18 +2541,33 @@ error_splat_super:
 		bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
 	deactivate_locked_super(s);
-	dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error);
+	dprintk("<-- nfs_xdev_mount_common() = %d [splat]\n", error);
 	return ERR_PTR(error);
 }
 
+/*
+ * Clone an NFS2/3 server record on xdev traversal (FSID-change)
+ */
+static struct dentry *
+nfs_xdev_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *raw_data)
+{
+	struct nfs_mount_info mount_info = {
+		.fill_super = nfs_clone_super,
+		.cloned   = raw_data,
+	};
+	return nfs_xdev_mount_common(&nfs_fs_type, flags, dev_name, &mount_info);
+}
+
 #ifdef CONFIG_NFS_V4
 
 /*
  * Finish setting up a cloned NFS4 superblock
  */
 static void nfs4_clone_super(struct super_block *sb,
-			    const struct super_block *old_sb)
+			     struct nfs_mount_info *mount_info)
 {
+	const struct super_block *old_sb = mount_info->cloned->sb;
 	sb->s_blocksize_bits = old_sb->s_blocksize_bits;
 	sb->s_blocksize = old_sb->s_blocksize;
 	sb->s_maxbytes = old_sb->s_maxbytes;
@@ -2930,86 +2947,11 @@ static struct dentry *
 nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
 		 const char *dev_name, void *raw_data)
 {
-	struct nfs_clone_mount *data = raw_data;
-	struct super_block *s;
-	struct nfs_server *server;
-	struct dentry *mntroot;
-	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
-	struct nfs_sb_mountdata sb_mntdata = {
-		.mntflags = flags,
+	struct nfs_mount_info mount_info = {
+		.fill_super = nfs4_clone_super,
+		.cloned = raw_data,
 	};
-	int error;
-
-	dprintk("--> nfs4_xdev_mount()\n");
-
-	/* create a new volume representation */
-	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor);
-	if (IS_ERR(server)) {
-		error = PTR_ERR(server);
-		goto out_err_noserver;
-	}
-	sb_mntdata.server = server;
-
-	if (server->flags & NFS_MOUNT_UNSHARED)
-		compare_super = NULL;
-
-	/* -o noac implies -o sync */
-	if (server->flags & NFS_MOUNT_NOAC)
-		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
-
-	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
-	if (IS_ERR(s)) {
-		error = PTR_ERR(s);
-		goto out_err_nosb;
-	}
-
-	if (s->s_fs_info != server) {
-		nfs_free_server(server);
-		server = NULL;
-	} else {
-		error = nfs_bdi_register(server);
-		if (error)
-			goto error_splat_bdi;
-	}
-
-	if (!s->s_root) {
-		/* initial superblock/root creation */
-		nfs4_clone_super(s, data->sb);
-		nfs_get_cache_cookie(s, NULL, data);
-	}
-
-	mntroot = nfs_get_root(s, data->fh, dev_name);
-	if (IS_ERR(mntroot)) {
-		error = PTR_ERR(mntroot);
-		goto error_splat_super;
-	}
-	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
-		dput(mntroot);
-		error = -ESTALE;
-		goto error_splat_super;
-	}
-
-	s->s_flags |= MS_ACTIVE;
-
-	security_sb_clone_mnt_opts(data->sb, s);
-
-	dprintk("<-- nfs4_xdev_mount() = 0\n");
-	return mntroot;
-
-out_err_nosb:
-	nfs_free_server(server);
-out_err_noserver:
-	dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error);
-	return ERR_PTR(error);
-
-error_splat_super:
-	if (server && !s->s_root)
-		bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
-	deactivate_locked_super(s);
-	dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error);
-	return ERR_PTR(error);
+	return nfs_xdev_mount_common(&nfs4_fs_type, flags, dev_name, &mount_info);
 }
 
 static struct dentry *
-- 
cgit v0.10.2


From 3d176e3fe4f6dc379b252bf43e2e146a8f7caf01 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:36 -0400
Subject: NFS: Use nfs_fs_mount_common() for xdev mounts

At this point, there are only a few small differences between these two
functions.  I can set a few function pointers in the nfs_mount_info
struct to get around these differences.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 707d1f6..e93a6e9 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -279,6 +279,7 @@ static match_table_t nfs_vers_tokens = {
 
 struct nfs_mount_info {
 	void (*fill_super)(struct super_block *, struct nfs_mount_info *);
+	int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *);
 	struct nfs_parsed_mount_data *parsed;
 	struct nfs_clone_mount *cloned;
 };
@@ -2312,6 +2313,22 @@ static int nfs_bdi_register(struct nfs_server *server)
 	return bdi_register_dev(&server->backing_dev_info, server->s_dev);
 }
 
+static int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
+			       struct nfs_mount_info *mount_info)
+{
+	return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts);
+}
+
+static int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot,
+				 struct nfs_mount_info *mount_info)
+{
+	/* clone any lsm security options from the parent to the new sb */
+	security_sb_clone_mnt_opts(mount_info->cloned->sb, s);
+	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops)
+		return -ESTALE;
+	return 0;
+}
+
 static struct dentry *nfs_fs_mount_common(struct file_system_type *fs_type,
 					  struct nfs_server *server,
 					  int flags, const char *dev_name,
@@ -2355,14 +2372,14 @@ static struct dentry *nfs_fs_mount_common(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		mount_info->fill_super(s, mount_info);
-		nfs_get_cache_cookie(s, mount_info->parsed, NULL);
+		nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
 	}
 
 	mntroot = nfs_get_root(s, mntfh, dev_name);
 	if (IS_ERR(mntroot))
 		goto error_splat_super;
 
-	error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts);
+	error = mount_info->set_security(s, mntroot, mount_info);
 	if (error)
 		goto error_splat_root;
 
@@ -2393,6 +2410,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 	struct nfs_parsed_mount_data *data = NULL;
 	struct nfs_mount_info mount_info = {
 		.fill_super = nfs_fill_super,
+		.set_security = nfs_set_sb_security,
 	};
 	struct nfs_fh *mntfh;
 	struct dentry *mntroot = ERR_PTR(-ENOMEM);
@@ -2463,13 +2481,8 @@ nfs_xdev_mount_common(struct file_system_type *fs_type, int flags,
 		const char *dev_name, struct nfs_mount_info *mount_info)
 {
 	struct nfs_clone_mount *data = mount_info->cloned;
-	struct super_block *s;
 	struct nfs_server *server;
-	struct dentry *mntroot;
-	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
-	struct nfs_sb_mountdata sb_mntdata = {
-		.mntflags = flags,
-	};
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 	int error;
 
 	dprintk("--> nfs_xdev_mount_common()\n");
@@ -2478,71 +2491,17 @@ nfs_xdev_mount_common(struct file_system_type *fs_type, int flags,
 	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
-		goto out_err_noserver;
-	}
-	sb_mntdata.server = server;
-
-	if (server->flags & NFS_MOUNT_UNSHARED)
-		compare_super = NULL;
-
-	/* -o noac implies -o sync */
-	if (server->flags & NFS_MOUNT_NOAC)
-		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
-
-	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
-	if (IS_ERR(s)) {
-		error = PTR_ERR(s);
-		goto out_err_nosb;
-	}
-
-	if (s->s_fs_info != server) {
-		nfs_free_server(server);
-		server = NULL;
-	} else {
-		error = nfs_bdi_register(server);
-		if (error)
-			goto error_splat_bdi;
-	}
-
-	if (!s->s_root) {
-		/* initial superblock/root creation */
-		mount_info->fill_super(s, mount_info);
-		nfs_get_cache_cookie(s, NULL, data);
-	}
-
-	mntroot = nfs_get_root(s, data->fh, dev_name);
-	if (IS_ERR(mntroot)) {
-		error = PTR_ERR(mntroot);
-		goto error_splat_super;
-	}
-	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
-		dput(mntroot);
-		error = -ESTALE;
-		goto error_splat_super;
+		goto out_err;
 	}
 
-	s->s_flags |= MS_ACTIVE;
-
-	/* clone any lsm security options from the parent to the new sb */
-	security_sb_clone_mnt_opts(data->sb, s);
-
+	mntroot = nfs_fs_mount_common(fs_type, server, flags, dev_name, data->fh, mount_info);
 	dprintk("<-- nfs_xdev_mount_common() = 0\n");
+out:
 	return mntroot;
 
-out_err_nosb:
-	nfs_free_server(server);
-out_err_noserver:
+out_err:
 	dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error);
-	return ERR_PTR(error);
-
-error_splat_super:
-	if (server && !s->s_root)
-		bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
-	deactivate_locked_super(s);
-	dprintk("<-- nfs_xdev_mount_common() = %d [splat]\n", error);
-	return ERR_PTR(error);
+	goto out;
 }
 
 /*
@@ -2554,6 +2513,7 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
 {
 	struct nfs_mount_info mount_info = {
 		.fill_super = nfs_clone_super,
+		.set_security = nfs_clone_sb_security,
 		.cloned   = raw_data,
 	};
 	return nfs_xdev_mount_common(&nfs_fs_type, flags, dev_name, &mount_info);
@@ -2740,6 +2700,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
 {
 	struct nfs_mount_info mount_info = {
 		.fill_super = nfs4_fill_super,
+		.set_security = nfs_set_sb_security,
 		.parsed = raw_data,
 	};
 	struct nfs_server *server;
@@ -2949,6 +2910,7 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
 {
 	struct nfs_mount_info mount_info = {
 		.fill_super = nfs4_clone_super,
+		.set_security = nfs_clone_sb_security,
 		.cloned = raw_data,
 	};
 	return nfs_xdev_mount_common(&nfs4_fs_type, flags, dev_name, &mount_info);
-- 
cgit v0.10.2


From 21e4b82e13c038457b4fa4d54d988c9f1865bcf6 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:37 -0400
Subject: NFS: Use nfs_fs_mount_common() for remote referral mounts

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e93a6e9..1157189 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2920,95 +2920,32 @@ static struct dentry *
 nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
 			   const char *dev_name, void *raw_data)
 {
-	struct nfs_clone_mount *data = raw_data;
-	struct super_block *s;
+	struct nfs_mount_info mount_info = {
+		.fill_super = nfs4_fill_super,
+		.set_security = nfs_clone_sb_security,
+		.cloned = raw_data,
+	};
 	struct nfs_server *server;
-	struct dentry *mntroot;
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 	struct nfs_fh *mntfh;
-	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
-	struct nfs_sb_mountdata sb_mntdata = {
-		.mntflags = flags,
-	};
-	int error = -ENOMEM;
 
 	dprintk("--> nfs4_referral_get_sb()\n");
 
 	mntfh = nfs_alloc_fhandle();
-	if (mntfh == NULL)
-		goto out_err_nofh;
+	if (mount_info.cloned == NULL || mntfh == NULL)
+		goto out;
 
 	/* create a new volume representation */
-	server = nfs4_create_referral_server(data, mntfh);
+	server = nfs4_create_referral_server(mount_info.cloned, mntfh);
 	if (IS_ERR(server)) {
-		error = PTR_ERR(server);
-		goto out_err_noserver;
-	}
-	sb_mntdata.server = server;
-
-	if (server->flags & NFS_MOUNT_UNSHARED)
-		compare_super = NULL;
-
-	/* -o noac implies -o sync */
-	if (server->flags & NFS_MOUNT_NOAC)
-		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
-
-	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
-	if (IS_ERR(s)) {
-		error = PTR_ERR(s);
-		goto out_err_nosb;
-	}
-
-	if (s->s_fs_info != server) {
-		nfs_free_server(server);
-		server = NULL;
-	} else {
-		error = nfs_bdi_register(server);
-		if (error)
-			goto error_splat_bdi;
-	}
-
-	if (!s->s_root) {
-		/* initial superblock/root creation */
-		nfs4_fill_super(s, NULL);
-		nfs_get_cache_cookie(s, NULL, data);
-	}
-
-	mntroot = nfs_get_root(s, mntfh, dev_name);
-	if (IS_ERR(mntroot)) {
-		error = PTR_ERR(mntroot);
-		goto error_splat_super;
-	}
-	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
-		dput(mntroot);
-		error = -ESTALE;
-		goto error_splat_super;
+		mntroot = ERR_CAST(server);
+		goto out;
 	}
 
-	s->s_flags |= MS_ACTIVE;
-
-	security_sb_clone_mnt_opts(data->sb, s);
-
+	mntroot = nfs_fs_mount_common(&nfs4_fs_type, server, flags, dev_name, mntfh, &mount_info);
+out:
 	nfs_free_fhandle(mntfh);
-	dprintk("<-- nfs4_referral_get_sb() = 0\n");
 	return mntroot;
-
-out_err_nosb:
-	nfs_free_server(server);
-out_err_noserver:
-	nfs_free_fhandle(mntfh);
-out_err_nofh:
-	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
-	return ERR_PTR(error);
-
-error_splat_super:
-	if (server && !s->s_root)
-		bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
-	deactivate_locked_super(s);
-	nfs_free_fhandle(mntfh);
-	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
-	return ERR_PTR(error);
 }
 
 /*
-- 
cgit v0.10.2


From db8333519187d5974cf2ff33910c893bf8727d9f Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:38 -0400
Subject: NFS: Let mount data parsing set the NFS version

This field is unconditionally set while parsing mount data, so there is
no need to fill it in here.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 1157189..db0952d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -945,7 +945,7 @@ static void nfs_umount_begin(struct super_block *sb)
 		rpc_killall_tasks(rpc);
 }
 
-static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version)
+static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
 {
 	struct nfs_parsed_mount_data *data;
 
@@ -960,7 +960,6 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
 		data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 		data->auth_flavors[0]	= RPC_AUTH_UNIX;
 		data->auth_flavor_len	= 1;
-		data->version		= version;
 		data->minorversion	= 0;
 		data->net		= current->nsproxy->net_ns;
 		security_init_mnt_opts(&data->lsm_opts);
@@ -2416,7 +2415,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 	int error;
 
-	data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
+	data = nfs_alloc_parsed_mount_data();
 	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
 		goto out;
@@ -2867,7 +2866,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
 	int error = -ENOMEM;
 	struct dentry *res = ERR_PTR(-ENOMEM);
 
-	data = nfs_alloc_parsed_mount_data(4);
+	data = nfs_alloc_parsed_mount_data();
 	if (data == NULL)
 		goto out;
 
-- 
cgit v0.10.2


From 486aa699ffb6ec28adbc147326d62ac9294de8dc Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:39 -0400
Subject: NFS: Create a new nfs_try_mount()

This function returns the same same return type as nfs4_try_mount() so
they two can be more easily substituted.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index db0952d..c69c806 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -290,6 +290,9 @@ static int  nfs_show_options(struct seq_file *, struct dentry *);
 static int  nfs_show_devname(struct seq_file *, struct dentry *);
 static int  nfs_show_path(struct seq_file *, struct dentry *);
 static int  nfs_show_stats(struct seq_file *, struct dentry *);
+static struct dentry *nfs_fs_mount_common(struct file_system_type *,
+		struct nfs_server *, int, const char *, struct nfs_fh *,
+		struct nfs_mount_info *);
 static struct dentry *nfs_fs_mount(struct file_system_type *,
 		int, const char *, void *);
 static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
@@ -1680,8 +1683,8 @@ static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
  * Use the remote server's MOUNT service to request the NFS file handle
  * corresponding to the provided path.
  */
-static int nfs_try_mount(struct nfs_parsed_mount_data *args,
-			 struct nfs_fh *root_fh)
+static int nfs_request_mount(struct nfs_parsed_mount_data *args,
+			     struct nfs_fh *root_fh)
 {
 	rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
 	unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
@@ -1744,6 +1747,25 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	return nfs_walk_authlist(args, &request);
 }
 
+static struct dentry *nfs_try_mount(int flags, const char *dev_name,
+				    struct nfs_fh *mntfh,
+				    struct nfs_mount_info *mount_info)
+{
+	int status;
+	struct nfs_server *server;
+
+	status = nfs_request_mount(mount_info->parsed, mntfh);
+	if (status)
+		return ERR_PTR(status);
+
+	/* Get a volume representation */
+	server = nfs_create_server(mount_info->parsed, mntfh);
+	if (IS_ERR(server))
+		return ERR_CAST(server);
+
+	return nfs_fs_mount_common(&nfs_fs_type, server, flags, dev_name, mntfh, mount_info);
+}
+
 /*
  * Split "dev_name" into "hostname:export_path".
  *
@@ -1966,11 +1988,6 @@ static int nfs_validate_mount_data(void *options,
 					   PAGE_SIZE,
 					   &args->nfs_server.export_path,
 					   NFS_MAXPATHLEN);
-		if (!status)
-			status = nfs_try_mount(args, mntfh);
-
-		kfree(args->nfs_server.export_path);
-		args->nfs_server.export_path = NULL;
 
 		if (status)
 			return status;
@@ -2405,7 +2422,6 @@ error_splat_bdi:
 static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data)
 {
-	struct nfs_server *server;
 	struct nfs_parsed_mount_data *data = NULL;
 	struct nfs_mount_info mount_info = {
 		.fill_super = nfs_fill_super,
@@ -2429,20 +2445,12 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 	mount_info.parsed = data;
 
 #ifdef CONFIG_NFS_V4
-	if (data->version == 4) {
+	if (data->version == 4)
 		mntroot = nfs4_try_mount(flags, dev_name, data);
-		goto out;
-	}
+	else
 #endif	/* CONFIG_NFS_V4 */
+		mntroot = nfs_try_mount(flags, dev_name, mntfh, &mount_info);
 
-	/* Get a volume representation */
-	server = nfs_create_server(data, mntfh);
-	if (IS_ERR(server)) {
-		mntroot = ERR_CAST(server);
-		goto out;
-	}
-
-	mntroot = nfs_fs_mount_common(fs_type, server, flags, dev_name, mntfh, &mount_info);
 out:
 	nfs_free_parsed_mount_data(data);
 	nfs_free_fhandle(mntfh);
-- 
cgit v0.10.2


From b72e4f42a33137acc037546277a08f407d3c1016 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:40 -0400
Subject: NFS: Create a single function for text mount data

The v2/3 and v4 cases were very similar, with just a few parameters
changed.  This makes it easy to share code.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 1855e8f..4e9b0ff 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -103,6 +103,7 @@ struct nfs_parsed_mount_data {
 	unsigned int		version;
 	unsigned int		minorversion;
 	char			*fscache_uniq;
+	bool			need_mount;
 
 	struct {
 		struct sockaddr_storage	address;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index c69c806..db636d7 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -66,6 +66,7 @@
 #include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
+#define NFS_TEXT_DATA		1
 
 #ifdef CONFIG_NFS_V3
 #define NFS_DEFAULT_VERSION 3
@@ -333,8 +334,7 @@ static const struct super_operations nfs_sops = {
 };
 
 #ifdef CONFIG_NFS_V4
-static int nfs4_validate_text_mount_data(void *options,
-	struct nfs_parsed_mount_data *args, const char *dev_name);
+static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *);
 static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
 	struct nfs_parsed_mount_data *data);
 static struct dentry *nfs4_mount(struct file_system_type *fs_type,
@@ -964,6 +964,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
 		data->auth_flavors[0]	= RPC_AUTH_UNIX;
 		data->auth_flavor_len	= 1;
 		data->minorversion	= 0;
+		data->need_mount	= true;
 		data->net		= current->nsproxy->net_ns;
 		security_init_mnt_opts(&data->lsm_opts);
 	}
@@ -1754,9 +1755,11 @@ static struct dentry *nfs_try_mount(int flags, const char *dev_name,
 	int status;
 	struct nfs_server *server;
 
-	status = nfs_request_mount(mount_info->parsed, mntfh);
-	if (status)
-		return ERR_PTR(status);
+	if (mount_info->parsed->need_mount) {
+		status = nfs_request_mount(mount_info->parsed, mntfh);
+		if (status)
+			return ERR_PTR(status);
+	}
 
 	/* Get a volume representation */
 	server = nfs_create_server(mount_info->parsed, mntfh);
@@ -1911,6 +1914,7 @@ static int nfs_validate_mount_data(void *options,
 		args->acregmax		= data->acregmax;
 		args->acdirmin		= data->acdirmin;
 		args->acdirmax		= data->acdirmax;
+		args->need_mount	= false;
 
 		memcpy(sap, &data->addr, sizeof(data->addr));
 		args->nfs_server.addrlen = sizeof(data->addr);
@@ -1962,38 +1966,8 @@ static int nfs_validate_mount_data(void *options,
 		}
 
 		break;
-	default: {
-		int status;
-
-		if (nfs_parse_mount_options((char *)options, args) == 0)
-			return -EINVAL;
-
-		if (!nfs_verify_server_address(sap))
-			goto out_no_address;
-
-		if (args->version == 4)
-#ifdef CONFIG_NFS_V4
-			return nfs4_validate_text_mount_data(options,
-							     args, dev_name);
-#else
-			goto out_v4_not_compiled;
-#endif
-
-		nfs_set_port(sap, &args->nfs_server.port, 0);
-
-		nfs_set_mount_transport_protocol(args);
-
-		status = nfs_parse_devname(dev_name,
-					   &args->nfs_server.hostname,
-					   PAGE_SIZE,
-					   &args->nfs_server.export_path,
-					   NFS_MAXPATHLEN);
-
-		if (status)
-			return status;
-
-		break;
-		}
+	default:
+		return NFS_TEXT_DATA;
 	}
 
 #ifndef CONFIG_NFS_V3
@@ -2022,12 +1996,6 @@ out_v3_not_compiled:
 	return -EPROTONOSUPPORT;
 #endif /* !CONFIG_NFS_V3 */
 
-#ifndef CONFIG_NFS_V4
-out_v4_not_compiled:
-	dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
-	return -EPROTONOSUPPORT;
-#endif /* !CONFIG_NFS_V4 */
-
 out_nomem:
 	dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
 	return -ENOMEM;
@@ -2041,6 +2009,60 @@ out_invalid_fh:
 	return -EINVAL;
 }
 
+static int nfs_validate_text_mount_data(void *options,
+					struct nfs_parsed_mount_data *args,
+					const char *dev_name)
+{
+	int port = 0;
+	int max_namelen = PAGE_SIZE;
+	int max_pathlen = NFS_MAXPATHLEN;
+	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
+
+	if (nfs_parse_mount_options((char *)options, args) == 0)
+		return -EINVAL;
+
+	if (!nfs_verify_server_address(sap))
+		goto out_no_address;
+
+	if (args->version == 4) {
+#ifdef CONFIG_NFS_V4
+		port = NFS_PORT;
+		max_namelen = NFS4_MAXNAMLEN;
+		max_pathlen = NFS4_MAXPATHLEN;
+		nfs_validate_transport_protocol(args);
+		nfs4_validate_mount_flags(args);
+#else
+		goto out_v4_not_compiled;
+#endif /* CONFIG_NFS_V4 */
+	} else
+		nfs_set_mount_transport_protocol(args);
+
+	nfs_set_port(sap, &args->nfs_server.port, port);
+
+	if (args->auth_flavor_len > 1)
+		goto out_bad_auth;
+
+	return nfs_parse_devname(dev_name,
+				   &args->nfs_server.hostname,
+				   max_namelen,
+				   &args->nfs_server.export_path,
+				   max_pathlen);
+
+#ifndef CONFIG_NFS_V4
+out_v4_not_compiled:
+	dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
+	return -EPROTONOSUPPORT;
+#endif /* !CONFIG_NFS_V4 */
+
+out_no_address:
+	dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n");
+	return -EINVAL;
+
+out_bad_auth:
+	dfprintk(MOUNT, "NFS: Too many RPC auth flavours specified\n");
+	return -EINVAL;
+}
+
 static int
 nfs_compare_remount_data(struct nfs_server *nfss,
 			 struct nfs_parsed_mount_data *data)
@@ -2438,6 +2460,8 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 
 	/* Validate the mount data */
 	error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
+	if (error == NFS_TEXT_DATA)
+		error = nfs_validate_text_mount_data(raw_data, data, dev_name);
 	if (error < 0) {
 		mntroot = ERR_PTR(error);
 		goto out;
@@ -2572,37 +2596,6 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
 			 NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);
 }
 
-static int nfs4_validate_text_mount_data(void *options,
-					 struct nfs_parsed_mount_data *args,
-					 const char *dev_name)
-{
-	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
-
-	nfs_set_port(sap, &args->nfs_server.port, NFS_PORT);
-
-	nfs_validate_transport_protocol(args);
-
-	nfs4_validate_mount_flags(args);
-
-	if (args->version != 4) {
-		dfprintk(MOUNT,
-			 "NFS4: Illegal mount version\n");
-		return -EINVAL;
-	}
-
-	if (args->auth_flavor_len > 1) {
-		dfprintk(MOUNT,
-			 "NFS4: Too many RPC auth flavours specified\n");
-		return -EINVAL;
-	}
-
-	return nfs_parse_devname(dev_name,
-				   &args->nfs_server.hostname,
-				   NFS4_MAXNAMLEN,
-				   &args->nfs_server.export_path,
-				   NFS4_MAXPATHLEN);
-}
-
 /*
  * Validate NFSv4 mount options
  */
@@ -2673,13 +2666,7 @@ static int nfs4_validate_mount_data(void *options,
 
 		break;
 	default:
-		if (nfs_parse_mount_options((char *)options, args) == 0)
-			return -EINVAL;
-
-		if (!nfs_verify_server_address(sap))
-			return -EINVAL;
-
-		return nfs4_validate_text_mount_data(options, args, dev_name);
+		return NFS_TEXT_DATA;
 	}
 
 	return 0;
@@ -2880,6 +2867,8 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
 
 	/* Validate the mount data */
 	error = nfs4_validate_mount_data(raw_data, data, dev_name);
+	if (error == NFS_TEXT_DATA)
+		error = nfs_validate_text_mount_data(raw_data, data, dev_name);
 	if (error < 0) {
 		res = ERR_PTR(error);
 		goto out;
-- 
cgit v0.10.2


From d72c727cd9de490f936a41634e34cd4a61ba6dd6 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:41 -0400
Subject: NFS: Create a single nfs_validate_mount_data() function

This new function chooses between the v2/3 parser and the v4 parser by
filesystem type.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index db636d7..5b025b0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -335,6 +335,8 @@ static const struct super_operations nfs_sops = {
 
 #ifdef CONFIG_NFS_V4
 static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *);
+static int nfs4_validate_mount_data(void *options,
+	struct nfs_parsed_mount_data *args, const char *dev_name);
 static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
 	struct nfs_parsed_mount_data *data);
 static struct dentry *nfs4_mount(struct file_system_type *fs_type,
@@ -1857,10 +1859,10 @@ out_path:
  * + breaking back: trying proto=udp after proto=tcp, v2 after v3,
  *   mountproto=tcp after mountproto=udp, and so on
  */
-static int nfs_validate_mount_data(void *options,
-				   struct nfs_parsed_mount_data *args,
-				   struct nfs_fh *mntfh,
-				   const char *dev_name)
+static int nfs23_validate_mount_data(void *options,
+				     struct nfs_parsed_mount_data *args,
+				     struct nfs_fh *mntfh,
+				     const char *dev_name)
 {
 	struct nfs_mount_data *data = (struct nfs_mount_data *)options;
 	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
@@ -2009,6 +2011,28 @@ out_invalid_fh:
 	return -EINVAL;
 }
 
+#ifdef CONFIG_NFS_V4
+static int nfs_validate_mount_data(struct file_system_type *fs_type,
+				   void *options,
+				   struct nfs_parsed_mount_data *args,
+				   struct nfs_fh *mntfh,
+				   const char *dev_name)
+{
+	if (fs_type == &nfs_fs_type)
+		return nfs23_validate_mount_data(options, args, mntfh, dev_name);
+	return nfs4_validate_mount_data(options, args, dev_name);
+}
+#else
+static int nfs_validate_mount_data(struct file_system_type *fs_type,
+				   void *options,
+				   struct nfs_parsed_mount_data *args,
+				   struct nfs_fh *mntfh,
+				   const char *dev_name)
+{
+	return nfs23_validate_mount_data(options, args, mntfh, dev_name);
+}
+#endif
+
 static int nfs_validate_text_mount_data(void *options,
 					struct nfs_parsed_mount_data *args,
 					const char *dev_name)
@@ -2459,7 +2483,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 		goto out;
 
 	/* Validate the mount data */
-	error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
+	error = nfs_validate_mount_data(fs_type, raw_data, data, mntfh, dev_name);
 	if (error == NFS_TEXT_DATA)
 		error = nfs_validate_text_mount_data(raw_data, data, dev_name);
 	if (error < 0) {
@@ -2866,7 +2890,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
 		goto out;
 
 	/* Validate the mount data */
-	error = nfs4_validate_mount_data(raw_data, data, dev_name);
+	error = nfs_validate_mount_data(fs_type, raw_data, data, NULL, dev_name);
 	if (error == NFS_TEXT_DATA)
 		error = nfs_validate_text_mount_data(raw_data, data, dev_name);
 	if (error < 0) {
-- 
cgit v0.10.2


From 46058d46d3fcf2900f18d9bd5585c8f89d59e1c4 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:42 -0400
Subject: NFS: Allocate parsed mount data directly to the nfs_mount_info
 structure

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 5b025b0..fc62701 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2468,7 +2468,6 @@ error_splat_bdi:
 static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data)
 {
-	struct nfs_parsed_mount_data *data = NULL;
 	struct nfs_mount_info mount_info = {
 		.fill_super = nfs_fill_super,
 		.set_security = nfs_set_sb_security,
@@ -2477,30 +2476,29 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 	int error;
 
-	data = nfs_alloc_parsed_mount_data();
+	mount_info.parsed = nfs_alloc_parsed_mount_data();
 	mntfh = nfs_alloc_fhandle();
-	if (data == NULL || mntfh == NULL)
+	if (mount_info.parsed == NULL || mntfh == NULL)
 		goto out;
 
 	/* Validate the mount data */
-	error = nfs_validate_mount_data(fs_type, raw_data, data, mntfh, dev_name);
+	error = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed, mntfh, dev_name);
 	if (error == NFS_TEXT_DATA)
-		error = nfs_validate_text_mount_data(raw_data, data, dev_name);
+		error = nfs_validate_text_mount_data(raw_data, mount_info.parsed, dev_name);
 	if (error < 0) {
 		mntroot = ERR_PTR(error);
 		goto out;
 	}
-	mount_info.parsed = data;
 
 #ifdef CONFIG_NFS_V4
-	if (data->version == 4)
-		mntroot = nfs4_try_mount(flags, dev_name, data);
+	if (mount_info.parsed->version == 4)
+		mntroot = nfs4_try_mount(flags, dev_name, mount_info.parsed);
 	else
 #endif	/* CONFIG_NFS_V4 */
 		mntroot = nfs_try_mount(flags, dev_name, mntfh, &mount_info);
 
 out:
-	nfs_free_parsed_mount_data(data);
+	nfs_free_parsed_mount_data(mount_info.parsed);
 	nfs_free_fhandle(mntfh);
 	return mntroot;
 }
-- 
cgit v0.10.2


From 87c7083dc3eba802d6e9f312ec520a4814f59871 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 15:07:43 -0400
Subject: NFS: Pass mntfh as part of the nfs_mount_info structure

This allows me to use the filehandle allocated in nfs_fs_mount() for nfs
v4 mounts instead of allocating a new one.  Rather than change
nfs4_mount() to look almost exactly like nfs_fs_mount(), I instead
remove the function.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index fc62701..c3ae819 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -283,6 +283,7 @@ struct nfs_mount_info {
 	int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *);
 	struct nfs_parsed_mount_data *parsed;
 	struct nfs_clone_mount *cloned;
+	struct nfs_fh *mntfh;
 };
 
 static void nfs_umount_begin(struct super_block *);
@@ -292,8 +293,7 @@ static int  nfs_show_devname(struct seq_file *, struct dentry *);
 static int  nfs_show_path(struct seq_file *, struct dentry *);
 static int  nfs_show_stats(struct seq_file *, struct dentry *);
 static struct dentry *nfs_fs_mount_common(struct file_system_type *,
-		struct nfs_server *, int, const char *, struct nfs_fh *,
-		struct nfs_mount_info *);
+		struct nfs_server *, int, const char *, struct nfs_mount_info *);
 static struct dentry *nfs_fs_mount(struct file_system_type *,
 		int, const char *, void *);
 static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
@@ -338,9 +338,7 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *);
 static int nfs4_validate_mount_data(void *options,
 	struct nfs_parsed_mount_data *args, const char *dev_name);
 static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
-	struct nfs_parsed_mount_data *data);
-static struct dentry *nfs4_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data);
+	struct nfs_mount_info *mount_info);
 static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data);
 static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
@@ -354,7 +352,7 @@ static void nfs4_kill_super(struct super_block *sb);
 static struct file_system_type nfs4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.mount		= nfs4_mount,
+	.mount		= nfs_fs_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -1751,24 +1749,23 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
 }
 
 static struct dentry *nfs_try_mount(int flags, const char *dev_name,
-				    struct nfs_fh *mntfh,
 				    struct nfs_mount_info *mount_info)
 {
 	int status;
 	struct nfs_server *server;
 
 	if (mount_info->parsed->need_mount) {
-		status = nfs_request_mount(mount_info->parsed, mntfh);
+		status = nfs_request_mount(mount_info->parsed, mount_info->mntfh);
 		if (status)
 			return ERR_PTR(status);
 	}
 
 	/* Get a volume representation */
-	server = nfs_create_server(mount_info->parsed, mntfh);
+	server = nfs_create_server(mount_info->parsed, mount_info->mntfh);
 	if (IS_ERR(server))
 		return ERR_CAST(server);
 
-	return nfs_fs_mount_common(&nfs_fs_type, server, flags, dev_name, mntfh, mount_info);
+	return nfs_fs_mount_common(&nfs_fs_type, server, flags, dev_name, mount_info);
 }
 
 /*
@@ -2394,7 +2391,6 @@ static int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot,
 static struct dentry *nfs_fs_mount_common(struct file_system_type *fs_type,
 					  struct nfs_server *server,
 					  int flags, const char *dev_name,
-					  struct nfs_fh *mntfh,
 					  struct nfs_mount_info *mount_info)
 {
 	struct super_block *s;
@@ -2437,7 +2433,7 @@ static struct dentry *nfs_fs_mount_common(struct file_system_type *fs_type,
 		nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
 	}
 
-	mntroot = nfs_get_root(s, mntfh, dev_name);
+	mntroot = nfs_get_root(s, mount_info->mntfh, dev_name);
 	if (IS_ERR(mntroot))
 		goto error_splat_super;
 
@@ -2472,17 +2468,16 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 		.fill_super = nfs_fill_super,
 		.set_security = nfs_set_sb_security,
 	};
-	struct nfs_fh *mntfh;
 	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 	int error;
 
 	mount_info.parsed = nfs_alloc_parsed_mount_data();
-	mntfh = nfs_alloc_fhandle();
-	if (mount_info.parsed == NULL || mntfh == NULL)
+	mount_info.mntfh = nfs_alloc_fhandle();
+	if (mount_info.parsed == NULL || mount_info.mntfh == NULL)
 		goto out;
 
 	/* Validate the mount data */
-	error = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed, mntfh, dev_name);
+	error = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed, mount_info.mntfh, dev_name);
 	if (error == NFS_TEXT_DATA)
 		error = nfs_validate_text_mount_data(raw_data, mount_info.parsed, dev_name);
 	if (error < 0) {
@@ -2492,14 +2487,14 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 
 #ifdef CONFIG_NFS_V4
 	if (mount_info.parsed->version == 4)
-		mntroot = nfs4_try_mount(flags, dev_name, mount_info.parsed);
+		mntroot = nfs4_try_mount(flags, dev_name, &mount_info);
 	else
 #endif	/* CONFIG_NFS_V4 */
-		mntroot = nfs_try_mount(flags, dev_name, mntfh, &mount_info);
+		mntroot = nfs_try_mount(flags, dev_name, &mount_info);
 
 out:
 	nfs_free_parsed_mount_data(mount_info.parsed);
-	nfs_free_fhandle(mntfh);
+	nfs_free_fhandle(mount_info.mntfh);
 	return mntroot;
 }
 
@@ -2540,6 +2535,8 @@ nfs_xdev_mount_common(struct file_system_type *fs_type, int flags,
 
 	dprintk("--> nfs_xdev_mount_common()\n");
 
+	mount_info->mntfh = data->fh;
+
 	/* create a new volume representation */
 	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor);
 	if (IS_ERR(server)) {
@@ -2547,7 +2544,7 @@ nfs_xdev_mount_common(struct file_system_type *fs_type, int flags,
 		goto out_err;
 	}
 
-	mntroot = nfs_fs_mount_common(fs_type, server, flags, dev_name, data->fh, mount_info);
+	mntroot = nfs_fs_mount_common(fs_type, server, flags, dev_name, mount_info);
 	dprintk("<-- nfs_xdev_mount_common() = 0\n");
 out:
 	return mntroot;
@@ -2712,33 +2709,25 @@ out_no_address:
  */
 static struct dentry *
 nfs4_remote_mount(struct file_system_type *fs_type, int flags,
-		  const char *dev_name, void *raw_data)
+		  const char *dev_name, void *info)
 {
-	struct nfs_mount_info mount_info = {
-		.fill_super = nfs4_fill_super,
-		.set_security = nfs_set_sb_security,
-		.parsed = raw_data,
-	};
+	struct nfs_mount_info *mount_info = info;
 	struct nfs_server *server;
-	struct nfs_fh *mntfh;
 	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 
-	mntfh = nfs_alloc_fhandle();
-	if (mount_info.parsed == NULL || mntfh == NULL)
-		goto out;
+	mount_info->fill_super = nfs4_fill_super;
+	mount_info->set_security = nfs_set_sb_security;
 
 	/* Get a volume representation */
-	server = nfs4_create_server(mount_info.parsed, mntfh);
+	server = nfs4_create_server(mount_info->parsed, mount_info->mntfh);
 	if (IS_ERR(server)) {
 		mntroot = ERR_CAST(server);
 		goto out;
 	}
 
-	mntroot = nfs_fs_mount_common(fs_type, server, flags,
-				      dev_name, mntfh, &mount_info);
+	mntroot = nfs_fs_mount_common(fs_type, server, flags, dev_name, mount_info);
 
 out:
-	nfs_free_fhandle(mntfh);
 	return mntroot;
 }
 
@@ -2851,17 +2840,18 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
 }
 
 static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
-			 struct nfs_parsed_mount_data *data)
+			 struct nfs_mount_info *mount_info)
 {
 	char *export_path;
 	struct vfsmount *root_mnt;
 	struct dentry *res;
+	struct nfs_parsed_mount_data *data = mount_info->parsed;
 
 	dfprintk(MOUNT, "--> nfs4_try_mount()\n");
 
 	export_path = data->nfs_server.export_path;
 	data->nfs_server.export_path = "/";
-	root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
+	root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info,
 			data->nfs_server.hostname);
 	data->nfs_server.export_path = export_path;
 
@@ -2873,40 +2863,6 @@ static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
 	return res;
 }
 
-/*
- * Get the superblock for an NFS4 mountpoint
- */
-static struct dentry *nfs4_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
-{
-	struct nfs_parsed_mount_data *data;
-	int error = -ENOMEM;
-	struct dentry *res = ERR_PTR(-ENOMEM);
-
-	data = nfs_alloc_parsed_mount_data();
-	if (data == NULL)
-		goto out;
-
-	/* Validate the mount data */
-	error = nfs_validate_mount_data(fs_type, raw_data, data, NULL, dev_name);
-	if (error == NFS_TEXT_DATA)
-		error = nfs_validate_text_mount_data(raw_data, data, dev_name);
-	if (error < 0) {
-		res = ERR_PTR(error);
-		goto out;
-	}
-
-	res = nfs4_try_mount(flags, dev_name, data);
-	if (IS_ERR(res))
-		error = PTR_ERR(res);
-
-out:
-	nfs_free_parsed_mount_data(data);
-	dprintk("<-- nfs4_mount() = %d%s\n", error,
-			error != 0 ? " [error]" : "");
-	return res;
-}
-
 static void nfs4_kill_super(struct super_block *sb)
 {
 	struct nfs_server *server = NFS_SB(sb);
@@ -2945,24 +2901,23 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
 	};
 	struct nfs_server *server;
 	struct dentry *mntroot = ERR_PTR(-ENOMEM);
-	struct nfs_fh *mntfh;
 
 	dprintk("--> nfs4_referral_get_sb()\n");
 
-	mntfh = nfs_alloc_fhandle();
-	if (mount_info.cloned == NULL || mntfh == NULL)
+	mount_info.mntfh = nfs_alloc_fhandle();
+	if (mount_info.cloned == NULL || mount_info.mntfh == NULL)
 		goto out;
 
 	/* create a new volume representation */
-	server = nfs4_create_referral_server(mount_info.cloned, mntfh);
+	server = nfs4_create_referral_server(mount_info.cloned, mount_info.mntfh);
 	if (IS_ERR(server)) {
 		mntroot = ERR_CAST(server);
 		goto out;
 	}
 
-	mntroot = nfs_fs_mount_common(&nfs4_fs_type, server, flags, dev_name, mntfh, &mount_info);
+	mntroot = nfs_fs_mount_common(&nfs4_fs_type, server, flags, dev_name, &mount_info);
 out:
-	nfs_free_fhandle(mntfh);
+	nfs_free_fhandle(mount_info.mntfh);
 	return mntroot;
 }
 
-- 
cgit v0.10.2


From 5e7e5a0da28216fb9d0a49e93ee27668ef4f04f7 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 16:47:18 -0400
Subject: NFS: Create an NFS v3 stat_to_errno()

In theory, NFS v3 can have different error versions than NFS v2. v4 is
already using its own nfs4_stat_to_errno() to map error codes, so
rather than create something in the generic client for v2 and v3 to
share I instead give v3 its own function.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 4e9b0ff..bf64095 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -208,7 +208,6 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
 
 /* nfs2xdr.c */
-extern int nfs_stat_to_errno(enum nfs_stat);
 extern struct rpc_procinfo nfs_procedures[];
 extern int nfs2_decode_dirent(struct xdr_stream *,
 				struct nfs_entry *, int);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index c99008e..baf759b 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -61,6 +61,7 @@
 #define NFS_readdirres_sz	(1)
 #define NFS_statfsres_sz	(1+NFS_info_sz)
 
+static int nfs_stat_to_errno(enum nfs_stat);
 
 /*
  * While encoding arguments, set up the reply buffer in advance to
@@ -1111,7 +1112,7 @@ static const struct {
  * Returns a local errno value, or -EIO if the NFS status code is
  * not recognized.  This function is used jointly by NFSv2 and NFSv3.
  */
-int nfs_stat_to_errno(enum nfs_stat status)
+static int nfs_stat_to_errno(enum nfs_stat status)
 {
 	int i;
 
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index ee284c2..902de48 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -86,6 +86,8 @@
 				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
 #define ACL3_setaclres_sz	(1+NFS3_post_op_attr_sz)
 
+static int nfs3_stat_to_errno(enum nfs_stat);
+
 /*
  * Map file type to S_IFMT bits
  */
@@ -1388,7 +1390,7 @@ static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
 out:
 	return error;
 out_default:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1427,7 +1429,7 @@ static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1475,7 +1477,7 @@ out_default:
 	error = decode_post_op_attr(xdr, result->dir_attr);
 	if (unlikely(error))
 		goto out;
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1516,7 +1518,7 @@ static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
 out:
 	return error;
 out_default:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1557,7 +1559,7 @@ static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
 out:
 	return error;
 out_default:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1639,7 +1641,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1709,7 +1711,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1773,7 +1775,7 @@ out_default:
 	error = decode_wcc_data(xdr, result->dir_attr);
 	if (unlikely(error))
 		goto out;
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1812,7 +1814,7 @@ static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1856,7 +1858,7 @@ static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1899,7 +1901,7 @@ static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /**
@@ -2091,7 +2093,7 @@ out_default:
 	error = decode_post_op_attr(xdr, result->dir_attr);
 	if (unlikely(error))
 		goto out;
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -2159,7 +2161,7 @@ static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -2235,7 +2237,7 @@ static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -2298,7 +2300,7 @@ static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -2339,7 +2341,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 #ifdef CONFIG_NFS_V3_ACL
@@ -2404,7 +2406,7 @@ static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
 out:
 	return error;
 out_default:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
@@ -2423,11 +2425,76 @@ static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
 out:
 	return error;
 out_default:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 #endif  /* CONFIG_NFS_V3_ACL */
 
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ */
+static const struct {
+	int stat;
+	int errno;
+} nfs_errtbl[] = {
+	{ NFS_OK,		0		},
+	{ NFSERR_PERM,		-EPERM		},
+	{ NFSERR_NOENT,		-ENOENT		},
+	{ NFSERR_IO,		-errno_NFSERR_IO},
+	{ NFSERR_NXIO,		-ENXIO		},
+/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
+	{ NFSERR_ACCES,		-EACCES		},
+	{ NFSERR_EXIST,		-EEXIST		},
+	{ NFSERR_XDEV,		-EXDEV		},
+	{ NFSERR_NODEV,		-ENODEV		},
+	{ NFSERR_NOTDIR,	-ENOTDIR	},
+	{ NFSERR_ISDIR,		-EISDIR		},
+	{ NFSERR_INVAL,		-EINVAL		},
+	{ NFSERR_FBIG,		-EFBIG		},
+	{ NFSERR_NOSPC,		-ENOSPC		},
+	{ NFSERR_ROFS,		-EROFS		},
+	{ NFSERR_MLINK,		-EMLINK		},
+	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
+	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
+	{ NFSERR_DQUOT,		-EDQUOT		},
+	{ NFSERR_STALE,		-ESTALE		},
+	{ NFSERR_REMOTE,	-EREMOTE	},
+#ifdef EWFLUSH
+	{ NFSERR_WFLUSH,	-EWFLUSH	},
+#endif
+	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
+	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
+	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
+	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
+	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
+	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
+	{ NFSERR_BADTYPE,	-EBADTYPE	},
+	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
+	{ -1,			-EIO		}
+};
+
+/**
+ * nfs3_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized.  This function is used jointly by NFSv2 and NFSv3.
+ */
+static int nfs3_stat_to_errno(enum nfs_stat status)
+{
+	int i;
+
+	for (i = 0; nfs_errtbl[i].stat != -1; i++) {
+		if (nfs_errtbl[i].stat == (int)status)
+			return nfs_errtbl[i].errno;
+	}
+	dprintk("NFS: Unrecognized nfs status value: %u\n", status);
+	return nfs_errtbl[i].errno;
+}
+
+
 #define PROC(proc, argtype, restype, timer)				\
 [NFS3PROC_##proc] = {							\
 	.p_proc      = NFS3PROC_##proc,					\
-- 
cgit v0.10.2


From 2ba68002a74fb167b68844077d36e5ccfc87f323 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 16:47:19 -0400
Subject: NFS: Make v2 configurable

With this patch NFS v2 can be disabled during Kconfig.  I default the
option to "y" to match the current behavior.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 2a0e6c5..66b0f4c 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -29,6 +29,16 @@ config NFS_FS
 
 	  If unsure, say N.
 
+config NFS_V2
+	bool "NFS client support for NFS version 2"
+	depends on NFS_FS
+	default y
+	help
+	  This option enables support for version 2 of the NFS protocol
+	  (RFC 1094) in the kernel's NFS client.
+
+	  If unsure, say Y.
+
 config NFS_V3
 	bool "NFS client support for NFS version 3"
 	depends on NFS_FS
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index b58613d..7ddd45d 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -4,11 +4,12 @@
 
 obj-$(CONFIG_NFS_FS) += nfs.o
 
-nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
-			   direct.o pagelist.o proc.o read.o symlink.o unlink.o \
+nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o \
+			   direct.o pagelist.o read.o symlink.o unlink.o \
 			   write.o namespace.o mount_clnt.o \
 			   dns_resolve.o cache_lib.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
+nfs-$(CONFIG_NFS_V2)	+= proc.o nfs2xdr.o
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
 nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o
 nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a8f8de6..8f1c652 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -90,7 +90,9 @@ static bool nfs4_disable_idmapping = true;
  * RPC cruft for NFS
  */
 static const struct rpc_version *nfs_version[5] = {
+#ifdef CONFIG_NFS_V2
 	[2]			= &nfs_version2,
+#endif
 #ifdef CONFIG_NFS_V3
 	[3]			= &nfs_version3,
 #endif
@@ -847,7 +849,7 @@ static int nfs_init_server(struct nfs_server *server,
 		.hostname = data->nfs_server.hostname,
 		.addr = (const struct sockaddr *)&data->nfs_server.address,
 		.addrlen = data->nfs_server.addrlen,
-		.rpc_ops = &nfs_v2_clientops,
+		.rpc_ops = NULL,
 		.proto = data->nfs_server.protocol,
 		.net = data->net,
 	};
@@ -857,10 +859,20 @@ static int nfs_init_server(struct nfs_server *server,
 
 	dprintk("--> nfs_init_server()\n");
 
+	switch (data->version) {
+#ifdef CONFIG_NFS_V2
+	case 2:
+		cl_init.rpc_ops = &nfs_v2_clientops;
+		break;
+#endif
 #ifdef CONFIG_NFS_V3
-	if (data->version == 3)
+	case 3:
 		cl_init.rpc_ops = &nfs_v3_clientops;
+		break;
 #endif
+	default:
+		return -EPROTONOSUPPORT;
+	}
 
 	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
 			data->timeo, data->retrans);
-- 
cgit v0.10.2


From 981f9face8fc21bede7b7a56a7cf6c176da5ab05 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 10 May 2012 16:47:20 -0400
Subject: NFS: Turn v3 on by default

Most users will use NFS v3 or possibly v4 so this makes it easier for
them.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 66b0f4c..f90f4f5 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -42,6 +42,7 @@ config NFS_V2
 config NFS_V3
 	bool "NFS client support for NFS version 3"
 	depends on NFS_FS
+	default y
 	help
 	  This option enables support for version 3 of the NFS protocol
 	  (RFC 1813) in the kernel's NFS client.
-- 
cgit v0.10.2


From 5abc03cd919535c61b813f2319cb38326a41e810 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 14 May 2012 22:45:28 +0300
Subject: NFS: kmalloc() doesn't return an ERR_PTR()

Obviously we should check for NULL here instead of IS_ERR().

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: stable@vger.kernel.org [3.4]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index ba3019f..3e8edbe 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -640,20 +640,16 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
 	struct idmap_msg *im;
 	struct idmap *idmap = (struct idmap *)aux;
 	struct key *key = cons->key;
-	int ret;
+	int ret = -ENOMEM;
 
 	/* msg and im are freed in idmap_pipe_destroy_msg */
 	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
-	if (IS_ERR(msg)) {
-		ret = PTR_ERR(msg);
+	if (!msg)
 		goto out0;
-	}
 
 	im = kmalloc(sizeof(*im), GFP_KERNEL);
-	if (IS_ERR(im)) {
-		ret = PTR_ERR(im);
+	if (!im)
 		goto out1;
-	}
 
 	ret = nfs_idmap_prepare_message(key->description, im, msg);
 	if (ret < 0)
-- 
cgit v0.10.2


From bda14606a3c055dbbccd998fa91eb87c4c7b2027 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Sun, 13 May 2012 10:35:40 -0700
Subject: sunrpc: fix kernel-doc warnings

Fix kernel-doc warnings in sunrpc/rpc_pipe.c and
sunrpc/rpcb_clnt.c:

Warning(net/sunrpc/rpcb_clnt.c:428): No description found for parameter 'net'
Warning(net/sunrpc/rpcb_clnt.c:567): No description found for parameter 'net'

Warning(net/sunrpc/rpc_pipe.c:133): No description found for parameter 'pipe'
Warning(net/sunrpc/rpc_pipe.c:133): Excess function parameter 'inode' description in 'rpc_queue_upcall'
Warning(net/sunrpc/rpc_pipe.c:839): No description found for parameter 'pipe'
Warning(net/sunrpc/rpc_pipe.c:839): Excess function parameter 'ops' description in 'rpc_mkpipe_dentry'
Warning(net/sunrpc/rpc_pipe.c:839): Excess function parameter 'flags' description in 'rpc_mkpipe_dentry'
Warning(net/sunrpc/rpc_pipe.c:949): No description found for parameter 'dentry'
Warning(net/sunrpc/rpc_pipe.c:949): Excess function parameter 'clnt' description in 'rpc_remove_client_dir'

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 3b62cf2..7fb49c5 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -120,7 +120,7 @@ EXPORT_SYMBOL_GPL(rpc_pipe_generic_upcall);
 
 /**
  * rpc_queue_upcall - queue an upcall message to userspace
- * @inode: inode of upcall pipe on which to queue given message
+ * @pipe: upcall pipe on which to queue given message
  * @msg: message to queue
  *
  * Call with an @inode created by rpc_mkpipe() to queue an upcall.
@@ -819,9 +819,7 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
  * @parent: dentry of directory to create new "pipe" in
  * @name: name of pipe
  * @private: private data to associate with the pipe, for the caller's use
- * @ops: operations defining the behavior of the pipe: upcall, downcall,
- *	release_pipe, open_pipe, and destroy_msg.
- * @flags: rpc_pipe flags
+ * @pipe: &rpc_pipe containing input parameters
  *
  * Data is made available for userspace to read by calls to
  * rpc_queue_upcall().  The actual reads will result in calls to
@@ -943,7 +941,7 @@ struct dentry *rpc_create_client_dir(struct dentry *dentry,
 
 /**
  * rpc_remove_client_dir - Remove a directory created with rpc_create_client_dir()
- * @clnt: rpc client
+ * @dentry: dentry for the pipe
  */
 int rpc_remove_client_dir(struct dentry *dentry)
 {
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 78ac39f..3c06534 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -394,6 +394,7 @@ static int rpcb_register_call(struct rpc_clnt *clnt, struct rpc_message *msg)
 
 /**
  * rpcb_register - set or unset a port registration with the local rpcbind svc
+ * @net: target network namespace
  * @prog: RPC program number to bind
  * @vers: RPC version number to bind
  * @prot: transport protocol to register
@@ -521,6 +522,7 @@ static int rpcb_unregister_all_protofamilies(struct sunrpc_net *sn,
 
 /**
  * rpcb_v4_register - set or unset a port registration with the local rpcbind
+ * @net: target network namespace
  * @program: RPC program number of service to (un)register
  * @version: RPC version number of service to (un)register
  * @address: address family, IP address, and port to (un)register
-- 
cgit v0.10.2


From 7e450b4e47d14429d0cc17cf4ce389fc027937be Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 16 May 2012 13:04:26 -0400
Subject: rpc_pipefs: clear write bit from top level rpc_pipefs directory

We can't create new files or directories here from userspace, so let's
not pretend that this directory is writable.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 3b62cf2..f955d72 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1118,7 +1118,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_op = &s_ops;
 	sb->s_time_gran = 1;
 
-	inode = rpc_get_inode(sb, S_IFDIR | 0755);
+	inode = rpc_get_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO);
 	sb->s_root = root = d_make_root(inode);
 	if (!root)
 		return -ENOMEM;
-- 
cgit v0.10.2


From 39ffb9218e41b1ef4920432776791f5e9ed2eff3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 16 May 2012 10:21:30 -0700
Subject: NFS: Fix a compile issue when CONFIG_NFS_FSCACHE was undefined

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 2a08b91..c5b11b5 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -170,12 +170,6 @@ static inline void nfs_fscache_unregister(void) {}
 static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
 static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
 
-static inline void nfs_fscache_get_super_cookie(
-	struct super_block *sb,
-	const char *uniq,
-	struct nfs_clone_mount *mntdata)
-{
-}
 static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
 
 static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index c3ae819..a973eb1 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2346,6 +2346,7 @@ static int nfs_compare_super(struct super_block *sb, void *data)
 	return nfs_compare_mount_options(sb, server, mntflags);
 }
 
+#ifdef CONFIG_NFS_FSCACHE
 static void nfs_get_cache_cookie(struct super_block *sb,
 				 struct nfs_parsed_mount_data *parsed,
 				 struct nfs_clone_mount *cloned)
@@ -2366,6 +2367,13 @@ static void nfs_get_cache_cookie(struct super_block *sb,
 
 	nfs_fscache_get_super_cookie(sb, uniq, ulen);
 }
+#else
+static void nfs_get_cache_cookie(struct super_block *sb,
+				 struct nfs_parsed_mount_data *parsed,
+				 struct nfs_clone_mount *cloned)
+{
+}
+#endif
 
 static int nfs_bdi_register(struct nfs_server *server)
 {
-- 
cgit v0.10.2


From 5f23eff3814e9d255464e7a03dba47c27069ac78 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@tonian.com>
Date: Wed, 16 May 2012 11:35:36 +0300
Subject: NFS: fix unsigned comparison in nfs4_create_sec_client
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fs/nfs/nfs4namespace.c: In function ‘nfs4_create_sec_client’:
fs/nfs/nfs4namespace.c:171:2: error: comparison of unsigned expression < 0 is always false [-Werror=type-limits]

Introduced by commit 72de53ec4bca39c26709122a8f78bfefe7b6bca4
"NFS: Do secinfo as part of lookup"

Signed-off-by: Benny Halevy <bhalevy@tonian.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index a7f3ded..3f5519b 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -168,7 +168,7 @@ struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *ino
 	rpc_authflavor_t flavor;
 
 	flavor = nfs4_negotiate_security(inode, name);
-	if (flavor < 0)
+	if ((int)flavor < 0)
 		return ERR_PTR(flavor);
 
 	clone = rpc_clone_client(clnt);
-- 
cgit v0.10.2


From 6b34309936ed5c85cbe5868655814065f42c2f38 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 16 May 2012 13:30:35 -0400
Subject: sunrpc: suppress page allocation warnings in xprt_alloc_slot()

It's easily possible for these allocations to fail since we're using
GFP_NOWAIT here. We don't want to spam the logs with warnings about
that though.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 0cbcd1a..b239e75 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -979,7 +979,7 @@ static void xprt_alloc_slot(struct rpc_task *task)
 		list_del(&req->rq_list);
 		goto out_init_req;
 	}
-	req = xprt_dynamic_alloc_slot(xprt, GFP_NOWAIT);
+	req = xprt_dynamic_alloc_slot(xprt, GFP_NOWAIT|__GFP_NOWARN);
 	if (!IS_ERR(req))
 		goto out_init_req;
 	switch (PTR_ERR(req)) {
-- 
cgit v0.10.2


From 1afeaf5c29aa07db25760d2fbed5c08a3aec3498 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 19 May 2012 12:12:53 -0400
Subject: sunrpc: fix loss of task->tk_status after rpc_delay call in
 xprt_alloc_slot

xprt_alloc_slot will call rpc_delay() to make the task wait a bit before
retrying when it gets back an -ENOMEM error from xprt_dynamic_alloc_slot.
The problem is that rpc_delay will clear the task->tk_status, causing
call_reserveresult to abort the task.

The solution is simply to let call_reserveresult handle the ENOMEM error
directly.

Reported-by: Jeff Layton <jlayton@redhat.com>
Cc: stable@vger.kernel.org [>= 3.1]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index adf2990..25302c8 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1288,6 +1288,8 @@ call_reserveresult(struct rpc_task *task)
 	}
 
 	switch (status) {
+	case -ENOMEM:
+		rpc_delay(task, HZ >> 2);
 	case -EAGAIN:	/* woken up; retry */
 		task->tk_action = call_reserve;
 		return;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index b239e75..d7ccd79 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -984,15 +984,16 @@ static void xprt_alloc_slot(struct rpc_task *task)
 		goto out_init_req;
 	switch (PTR_ERR(req)) {
 	case -ENOMEM:
-		rpc_delay(task, HZ >> 2);
 		dprintk("RPC:       dynamic allocation of request slot "
 				"failed! Retrying\n");
+		task->tk_status = -ENOMEM;
 		break;
 	case -EAGAIN:
 		rpc_sleep_on(&xprt->backlog, task, NULL);
 		dprintk("RPC:       waiting for request slot\n");
+	default:
+		task->tk_status = -EAGAIN;
 	}
-	task->tk_status = -EAGAIN;
 	return;
 out_init_req:
 	task->tk_status = 0;
-- 
cgit v0.10.2


From e73e6c9e85ed91187c1d21cb9238e86a116bf3db Mon Sep 17 00:00:00 2001
From: Matthew Treinish <treinish@linux.vnet.ibm.com>
Date: Mon, 30 Apr 2012 11:32:57 -0400
Subject: Fixed goto readability in nfs_update_inode.

Simplified error gotos to make it slightly easier to read,
it doesn't affect the functionality of the routine.

Signed-off-by: Matthew Treinish <treinish@linux.vnet.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9f17cd1..9ad81ce 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1275,14 +1275,26 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			nfs_display_fhandle_hash(NFS_FH(inode)),
 			atomic_read(&inode->i_count), fattr->valid);
 
-	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
-		goto out_fileid;
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) {
+		printk(KERN_ERR "NFS: server %s error: fileid changed\n"
+			"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
+			NFS_SERVER(inode)->nfs_client->cl_hostname,
+			inode->i_sb->s_id, (long long)nfsi->fileid,
+			(long long)fattr->fileid);
+		goto out_err;
+	}
 
 	/*
 	 * Make sure the inode's type hasn't changed.
 	 */
-	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
-		goto out_changed;
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+		/*
+		* Big trouble! The inode has become a different object.
+		*/
+		printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n",
+				__func__, inode->i_ino, inode->i_mode, fattr->mode);
+		goto out_err;
+	}
 
 	server = NFS_SERVER(inode);
 	/* Update the fsid? */
@@ -1443,12 +1455,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfsi->cache_validity |= invalid;
 
 	return 0;
- out_changed:
-	/*
-	 * Big trouble! The inode has become a different object.
-	 */
-	printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n",
-			__func__, inode->i_ino, inode->i_mode, fattr->mode);
  out_err:
 	/*
 	 * No need to worry about unhashing the dentry, as the
@@ -1457,13 +1463,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	 */
 	nfs_invalidate_inode(inode);
 	return -ESTALE;
-
- out_fileid:
-	printk(KERN_ERR "NFS: server %s error: fileid changed\n"
-		"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
-		NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id,
-		(long long)nfsi->fileid, (long long)fattr->fileid);
-	goto out_err;
 }
 
 
-- 
cgit v0.10.2


From 554d458d79fa34acc73bc5128ba7bbf6b3007dfd Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 27 Apr 2012 17:53:42 -0400
Subject: NFSv4.1: cleanup filelayout invalid deviceid handling

Move the invalid deviceid test into nfs4_fl_prepare_ds, called by the
filelayout read, write, and commit routines. NFS4_DEVICE_ID_NEG_ENTRY
is no longer needed.
Remove redundant printk's - filelayout_mark_devid_invalid prints a KERN_WARNING.

An invalid device prevents pNFS io.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 80a63f6..eebec9a 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -389,9 +389,6 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 		__func__, hdr->inode->i_ino,
 		data->args.pgbase, (size_t)data->args.count, offset);
 
-	if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags))
-		return PNFS_NOT_ATTEMPTED;
-
 	/* Retrieve the correct rpc_client for the byte range */
 	j = nfs4_fl_calc_j_index(lseg, offset);
 	idx = nfs4_fl_calc_ds_index(lseg, j);
@@ -432,16 +429,11 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
 	struct nfs_fh *fh;
 	int status;
 
-	if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags))
-		return PNFS_NOT_ATTEMPTED;
-
 	/* Retrieve the correct rpc_client for the byte range */
 	j = nfs4_fl_calc_j_index(lseg, offset);
 	idx = nfs4_fl_calc_ds_index(lseg, j);
 	ds = nfs4_fl_prepare_ds(lseg, idx);
 	if (!ds) {
-		printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n",
-			__func__);
 		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
 		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
 		return PNFS_NOT_ATTEMPTED;
@@ -977,8 +969,6 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
 	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
 	ds = nfs4_fl_prepare_ds(lseg, idx);
 	if (!ds) {
-		printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n",
-			__func__);
 		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
 		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
 		prepare_to_resend_writes(data);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 96b89bb..2f6330c 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -62,12 +62,8 @@ struct nfs4_pnfs_ds {
 	atomic_t		ds_count;
 };
 
-/* nfs4_file_layout_dsaddr flags */
-#define NFS4_DEVICE_ID_NEG_ENTRY	0x00000001
-
 struct nfs4_file_layout_dsaddr {
 	struct nfs4_deviceid_node	id_node;
-	unsigned long			flags;
 	u32				stripe_count;
 	u8				*stripe_indices;
 	u32				ds_num;
@@ -111,6 +107,23 @@ FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
 	return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
 }
 
+static inline void
+filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
+{
+	u32 *p = (u32 *)&node->deviceid;
+
+	printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n",
+		p[0], p[1], p[2], p[3]);
+
+	set_bit(NFS_DEVICEID_INVALID, &node->flags);
+}
+
+static inline bool
+filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
+{
+	return test_bit(NFS_DEVICEID_INVALID, &node->flags);
+}
+
 extern struct nfs_fh *
 nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
 
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index c9cff9a..7e0be3e 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -791,48 +791,33 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
 	return flseg->fh_array[i];
 }
 
-static void
-filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
-			       int err, const char *ds_remotestr)
-{
-	u32 *p = (u32 *)&dsaddr->id_node.deviceid;
-
-	printk(KERN_ERR "NFS: data server %s connection error %d."
-		" Deviceid [%x%x%x%x] marked out of use.\n",
-		ds_remotestr, err, p[0], p[1], p[2], p[3]);
-
-	spin_lock(&nfs4_ds_cache_lock);
-	dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
-	spin_unlock(&nfs4_ds_cache_lock);
-}
-
 struct nfs4_pnfs_ds *
 nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 {
 	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
 	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
+	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
+
+	if (filelayout_test_devid_invalid(devid))
+		return NULL;
 
 	if (ds == NULL) {
 		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
 			__func__, ds_idx);
-		return NULL;
+		goto mark_dev_invalid;
 	}
 
 	if (!ds->ds_clp) {
 		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
 		int err;
 
-		if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
-			/* Already tried to connect, don't try again */
-			dprintk("%s Deviceid marked out of use\n", __func__);
-			return NULL;
-		}
 		err = nfs4_ds_connect(s, ds);
-		if (err) {
-			filelayout_mark_devid_negative(dsaddr, err,
-						       ds->ds_remotestr);
-			return NULL;
-		}
+		if (err)
+			goto mark_dev_invalid;
 	}
 	return ds;
+
+mark_dev_invalid:
+	filelayout_mark_devid_invalid(devid);
+	return NULL;
 }
-- 
cgit v0.10.2


From 90fecfcb3437dfc9bec4ee3306584dcd6843701b Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 27 Apr 2012 17:53:43 -0400
Subject: NFSv4.1 cleanup filelayout invalid layout handling

The invalid layout bits are should only be used to block LAYOUTGETs.

Do not invalidate a layout on deviceid invalidation.
Do not invalidate a layout on un-handled READ, WRITE, COMMIT errors.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index eebec9a..b9edc88 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -148,7 +148,6 @@ wait_on_recovery:
 static int filelayout_read_done_cb(struct rpc_task *task,
 				struct nfs_read_data *data)
 {
-	struct nfs_pgio_header *hdr = data->header;
 	int reset = 0;
 
 	dprintk("%s DS read\n", __func__);
@@ -157,10 +156,8 @@ static int filelayout_read_done_cb(struct rpc_task *task,
 					  data->ds_clp, &reset) == -EAGAIN) {
 		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
 			__func__, data->ds_clp, data->ds_clp->cl_session);
-		if (reset) {
-			pnfs_set_lo_fail(hdr->lseg);
+		if (reset)
 			nfs4_reset_read(task, data);
-		}
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -233,17 +230,14 @@ static void filelayout_read_release(void *data)
 static int filelayout_write_done_cb(struct rpc_task *task,
 				struct nfs_write_data *data)
 {
-	struct nfs_pgio_header *hdr = data->header;
 	int reset = 0;
 
 	if (filelayout_async_handle_error(task, data->args.context->state,
 					  data->ds_clp, &reset) == -EAGAIN) {
 		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
 			__func__, data->ds_clp, data->ds_clp->cl_session);
-		if (reset) {
-			pnfs_set_lo_fail(hdr->lseg);
+		if (reset)
 			nfs4_reset_write(task, data);
-		}
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -272,10 +266,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 					  data->ds_clp, &reset) == -EAGAIN) {
 		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
 			__func__, data->ds_clp, data->ds_clp->cl_session);
-		if (reset) {
+		if (reset)
 			prepare_to_resend_writes(data);
-			pnfs_set_lo_fail(data->lseg);
-		} else
+		else
 			rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -393,12 +386,8 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 	j = nfs4_fl_calc_j_index(lseg, offset);
 	idx = nfs4_fl_calc_ds_index(lseg, j);
 	ds = nfs4_fl_prepare_ds(lseg, idx);
-	if (!ds) {
-		/* Either layout fh index faulty, or ds connect failed */
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+	if (!ds)
 		return PNFS_NOT_ATTEMPTED;
-	}
 	dprintk("%s USE DS: %s\n", __func__, ds->ds_remotestr);
 
 	/* No multipath support. Use first DS */
@@ -433,11 +422,8 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
 	j = nfs4_fl_calc_j_index(lseg, offset);
 	idx = nfs4_fl_calc_ds_index(lseg, j);
 	ds = nfs4_fl_prepare_ds(lseg, idx);
-	if (!ds) {
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+	if (!ds)
 		return PNFS_NOT_ATTEMPTED;
-	}
 	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__,
 		hdr->inode->i_ino, sync, (size_t) data->args.count, offset,
 		ds->ds_remotestr);
@@ -969,8 +955,6 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
 	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
 	ds = nfs4_fl_prepare_ds(lseg, idx);
 	if (!ds) {
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
 		prepare_to_resend_writes(data);
 		filelayout_commit_release(data);
 		return -EAGAIN;
-- 
cgit v0.10.2


From 9f0ec176b3071e0472582c07ae1e68055b28184d Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 27 Apr 2012 17:53:44 -0400
Subject: NFSv4.1 set RPC_TASK_SOFTCONN for filelayout DS RPC calls

RPC_TASK_SOFTCONN returns connection errors to the caller which allows the pNFS
file layout to quickly try the MDS or perhaps another DS.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bf64095..6ed96c7 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -299,7 +299,7 @@ extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
 			const struct nfs_pgio_completion_ops *compl_ops);
 extern int nfs_initiate_read(struct rpc_clnt *clnt,
 			     struct nfs_read_data *data,
-			     const struct rpc_call_ops *call_ops);
+			     const struct rpc_call_ops *call_ops, int flags);
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
 			      struct nfs_pgio_header *hdr);
@@ -326,13 +326,13 @@ extern void nfs_commit_free(struct nfs_commit_data *p);
 extern int nfs_initiate_write(struct rpc_clnt *clnt,
 			      struct nfs_write_data *data,
 			      const struct rpc_call_ops *call_ops,
-			      int how);
+			      int how, int flags);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
 extern int nfs_initiate_commit(struct rpc_clnt *clnt,
 			       struct nfs_commit_data *data,
 			       const struct rpc_call_ops *call_ops,
-			       int how);
+			       int how, int flags);
 extern void nfs_init_commit(struct nfs_commit_data *data,
 			    struct list_head *head,
 			    struct pnfs_layout_segment *lseg,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index b9edc88..0db8c07 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -401,7 +401,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 
 	/* Perform an asynchronous read to ds */
 	status = nfs_initiate_read(ds->ds_clp->cl_rpcclient, data,
-				   &filelayout_read_call_ops);
+				  &filelayout_read_call_ops, RPC_TASK_SOFTCONN);
 	BUG_ON(status != 0);
 	return PNFS_ATTEMPTED;
 }
@@ -441,7 +441,8 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
 
 	/* Perform an asynchronous write */
 	status = nfs_initiate_write(ds->ds_clp->cl_rpcclient, data,
-				    &filelayout_write_call_ops, sync);
+				    &filelayout_write_call_ops, sync,
+				    RPC_TASK_SOFTCONN);
 	BUG_ON(status != 0);
 	return PNFS_ATTEMPTED;
 }
@@ -966,7 +967,8 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
 	if (fh)
 		data->args.fh = fh;
 	return nfs_initiate_commit(ds->ds_clp->cl_rpcclient, data,
-				   &filelayout_commit_call_ops, how);
+				   &filelayout_commit_call_ops, how,
+				   RPC_TASK_SOFTCONN);
 }
 
 static int
@@ -1120,7 +1122,7 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 		if (!data->lseg) {
 			nfs_init_commit(data, mds_pages, NULL, cinfo);
 			nfs_initiate_commit(NFS_CLIENT(inode), data,
-					    data->mds_ops, how);
+					    data->mds_ops, how, 0);
 		} else {
 			struct pnfs_commit_bucket *buckets;
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f23cf25..2cfdd77 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -206,7 +206,7 @@ out:
 
 int nfs_initiate_read(struct rpc_clnt *clnt,
 		      struct nfs_read_data *data,
-		      const struct rpc_call_ops *call_ops)
+		      const struct rpc_call_ops *call_ops, int flags)
 {
 	struct inode *inode = data->header->inode;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
@@ -223,7 +223,7 @@ int nfs_initiate_read(struct rpc_clnt *clnt,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC | swap_flags,
+		.flags = RPC_TASK_ASYNC | swap_flags | flags,
 	};
 
 	/* Set up the initial task struct. */
@@ -272,7 +272,7 @@ static int nfs_do_read(struct nfs_read_data *data,
 {
 	struct inode *inode = data->header->inode;
 
-	return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops);
+	return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0);
 }
 
 static int
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 8ffd7d5..e6fe3d6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -916,7 +916,7 @@ static int flush_task_priority(int how)
 int nfs_initiate_write(struct rpc_clnt *clnt,
 		       struct nfs_write_data *data,
 		       const struct rpc_call_ops *call_ops,
-		       int how)
+		       int how, int flags)
 {
 	struct inode *inode = data->header->inode;
 	int priority = flush_task_priority(how);
@@ -933,7 +933,7 @@ int nfs_initiate_write(struct rpc_clnt *clnt,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC,
+		.flags = RPC_TASK_ASYNC | flags,
 		.priority = priority,
 	};
 	int ret = 0;
@@ -1009,7 +1009,7 @@ static int nfs_do_write(struct nfs_write_data *data,
 {
 	struct inode *inode = data->header->inode;
 
-	return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how);
+	return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0);
 }
 
 static int nfs_do_multiple_writes(struct list_head *head,
@@ -1394,7 +1394,7 @@ EXPORT_SYMBOL_GPL(nfs_commitdata_release);
 
 int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
 			const struct rpc_call_ops *call_ops,
-			int how)
+			int how, int flags)
 {
 	struct rpc_task *task;
 	int priority = flush_task_priority(how);
@@ -1410,7 +1410,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC,
+		.flags = RPC_TASK_ASYNC | flags,
 		.priority = priority,
 	};
 	/* Set up the initial task struct.  */
@@ -1499,7 +1499,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 	/* Set up the argument struct */
 	nfs_init_commit(data, head, NULL, cinfo);
 	atomic_inc(&cinfo->mds->rpcs_out);
-	return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops, how);
+	return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops,
+				   how, 0);
  out_bad:
 	nfs_retry_commit(head, NULL, cinfo);
 	cinfo->completion_ops->error_cleanup(NFS_I(inode));
-- 
cgit v0.10.2


From 98fc685ae2aa24eae98526e9196b3229d519083a Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 27 Apr 2012 17:53:45 -0400
Subject: NFSv4.1 data server timeo and retrans module parameters

Set the recovery parameters for data servers.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8f1c652..b4e2199 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1477,8 +1477,8 @@ error:
  * the MDS.
  */
 struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
-		const struct sockaddr *ds_addr,
-		int ds_addrlen, int ds_proto)
+		const struct sockaddr *ds_addr, int ds_addrlen,
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
 {
 	struct nfs_client_initdata cl_init = {
 		.addr = ds_addr,
@@ -1488,12 +1488,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 		.minorversion = mds_clp->cl_minorversion,
 		.net = mds_clp->net,
 	};
-	struct rpc_timeout ds_timeout = {
-		.to_initval = 15 * HZ,
-		.to_maxval = 15 * HZ,
-		.to_retries = 1,
-		.to_exponential = 1,
-	};
+	struct rpc_timeout ds_timeout;
 	struct nfs_client *clp;
 
 	/*
@@ -1501,6 +1496,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 	 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
 	 * (section 13.1 RFC 5661).
 	 */
+	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
 	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
 			     mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 6ed96c7..1466c5d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -172,7 +172,9 @@ extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
 extern int nfs4_check_client_ready(struct nfs_client *clp);
 extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 					     const struct sockaddr *ds_addr,
-					     int ds_addrlen, int ds_proto);
+					     int ds_addrlen, int ds_proto,
+					     unsigned int ds_timeo,
+					     unsigned int ds_retrans);
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 2f6330c..6fb1901 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -33,6 +33,13 @@
 #include "pnfs.h"
 
 /*
+ * Default data server connection timeout and retrans vaules.
+ * Set by module paramters dataserver_timeo and dataserver_retrans.
+ */
+#define NFS4_DEF_DS_TIMEO   60
+#define NFS4_DEF_DS_RETRANS 5
+
+/*
  * Field testing shows we need to support up to 4096 stripe indices.
  * We store each index as a u8 (u32 on the wire) to keep the memory footprint
  * reasonable. This in turn means we support a maximum of 256
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 7e0be3e..d4d2032 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -30,12 +30,16 @@
 
 #include <linux/nfs_fs.h>
 #include <linux/vmalloc.h>
+#include <linux/module.h>
 
 #include "internal.h"
 #include "nfs4filelayout.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
 
+static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
+static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
+
 /*
  * Data server cache
  *
@@ -165,8 +169,9 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
 			__func__, ds->ds_remotestr, da->da_remotestr);
 
 		clp = nfs4_set_ds_client(mds_srv->nfs_client,
-				 (struct sockaddr *)&da->da_addr,
-				 da->da_addrlen, IPPROTO_TCP);
+					(struct sockaddr *)&da->da_addr,
+					da->da_addrlen, IPPROTO_TCP,
+					dataserver_timeo, dataserver_retrans);
 		if (!IS_ERR(clp))
 			break;
 	}
@@ -821,3 +826,12 @@ mark_dev_invalid:
 	filelayout_mark_devid_invalid(devid);
 	return NULL;
 }
+
+module_param(dataserver_retrans, uint, 0644);
+MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
+			"retries a request before it attempts further "
+			" recovery  action.");
+module_param(dataserver_timeo, uint, 0644);
+MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
+			"NFSv4.1  client  waits for a response from a "
+			" data server before it retries an NFS request.");
-- 
cgit v0.10.2


From e7dd79af01e7ca932c5168a708e77750659f7a9e Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 27 Apr 2012 17:53:46 -0400
Subject: NFSv4.1: mark deviceid invalid on filelayout DS connection errors

This prevents the use of any layout for i/o that references the deviceid.
I/O is redirected through the MDS.

Redirect the unhandled failed I/O to the MDS without marking either the
layout or the deviceid invalid.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 0db8c07..f503cbe5 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -82,29 +82,77 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
 	BUG();
 }
 
+static void filelayout_reset_write(struct nfs_write_data *data)
+{
+	struct nfs_pgio_header *hdr = data->header;
+	struct inode *inode = hdr->inode;
+	struct rpc_task *task = &data->task;
+
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		dprintk("%s Reset task %5u for i/o through MDS "
+			"(req %s/%lld, %u bytes @ offset %llu)\n", __func__,
+			data->task.tk_pid,
+			inode->i_sb->s_id,
+			(long long)NFS_FILEID(inode),
+			data->args.count,
+			(unsigned long long)data->args.offset);
+
+		task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
+							&hdr->pages,
+							hdr->completion_ops);
+	}
+}
+
+static void filelayout_reset_read(struct nfs_read_data *data)
+{
+	struct nfs_pgio_header *hdr = data->header;
+	struct inode *inode = hdr->inode;
+	struct rpc_task *task = &data->task;
+
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		dprintk("%s Reset task %5u for i/o through MDS "
+			"(req %s/%lld, %u bytes @ offset %llu)\n", __func__,
+			data->task.tk_pid,
+			inode->i_sb->s_id,
+			(long long)NFS_FILEID(inode),
+			data->args.count,
+			(unsigned long long)data->args.offset);
+
+		task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
+							&hdr->pages,
+							hdr->completion_ops);
+	}
+}
+
 static int filelayout_async_handle_error(struct rpc_task *task,
 					 struct nfs4_state *state,
 					 struct nfs_client *clp,
-					 int *reset)
+					 struct pnfs_layout_segment *lseg)
 {
-	struct nfs_server *mds_server = NFS_SERVER(state->inode);
+	struct inode *inode = lseg->pls_layout->plh_inode;
+	struct nfs_server *mds_server = NFS_SERVER(inode);
+	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
 	struct nfs_client *mds_client = mds_server->nfs_client;
 
 	if (task->tk_status >= 0)
 		return 0;
-	*reset = 0;
 
 	switch (task->tk_status) {
 	/* MDS state errors */
 	case -NFS4ERR_DELEG_REVOKED:
 	case -NFS4ERR_ADMIN_REVOKED:
 	case -NFS4ERR_BAD_STATEID:
+		if (state == NULL)
+			break;
 		nfs_remove_bad_delegation(state->inode);
 	case -NFS4ERR_OPENMODE:
+		if (state == NULL)
+			break;
 		nfs4_schedule_stateid_recovery(mds_server, state);
 		goto wait_on_recovery;
 	case -NFS4ERR_EXPIRED:
-		nfs4_schedule_stateid_recovery(mds_server, state);
+		if (state != NULL)
+			nfs4_schedule_stateid_recovery(mds_server, state);
 		nfs4_schedule_lease_recovery(mds_client);
 		goto wait_on_recovery;
 	/* DS session errors */
@@ -127,11 +175,22 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 		break;
 	case -NFS4ERR_RETRY_UNCACHED_REP:
 		break;
+	/* RPC connection errors */
+	case -ECONNREFUSED:
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+	case -EIO:
+	case -ETIMEDOUT:
+	case -EPIPE:
+		dprintk("%s DS connection error %d\n", __func__,
+			task->tk_status);
+		filelayout_mark_devid_invalid(devid);
+		/* fall through */
 	default:
-		dprintk("%s DS error. Retry through MDS %d\n", __func__,
+		dprintk("%s Retry through MDS. Error %d\n", __func__,
 			task->tk_status);
-		*reset = 1;
-		break;
+		return -NFS4ERR_RESET_TO_MDS;
 	}
 out:
 	task->tk_status = 0;
@@ -148,16 +207,17 @@ wait_on_recovery:
 static int filelayout_read_done_cb(struct rpc_task *task,
 				struct nfs_read_data *data)
 {
-	int reset = 0;
+	struct nfs_pgio_header *hdr = data->header;
+	int err;
 
-	dprintk("%s DS read\n", __func__);
+	err = filelayout_async_handle_error(task, data->args.context->state,
+					    data->ds_clp, hdr->lseg);
 
-	if (filelayout_async_handle_error(task, data->args.context->state,
-					  data->ds_clp, &reset) == -EAGAIN) {
-		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
-			__func__, data->ds_clp, data->ds_clp->cl_session);
-		if (reset)
-			nfs4_reset_read(task, data);
+	switch (err) {
+	case -NFS4ERR_RESET_TO_MDS:
+		filelayout_reset_read(data);
+		return task->tk_status;
+	case -EAGAIN:
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -230,14 +290,17 @@ static void filelayout_read_release(void *data)
 static int filelayout_write_done_cb(struct rpc_task *task,
 				struct nfs_write_data *data)
 {
-	int reset = 0;
-
-	if (filelayout_async_handle_error(task, data->args.context->state,
-					  data->ds_clp, &reset) == -EAGAIN) {
-		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
-			__func__, data->ds_clp, data->ds_clp->cl_session);
-		if (reset)
-			nfs4_reset_write(task, data);
+	struct nfs_pgio_header *hdr = data->header;
+	int err;
+
+	err = filelayout_async_handle_error(task, data->args.context->state,
+					    data->ds_clp, hdr->lseg);
+
+	switch (err) {
+	case -NFS4ERR_RESET_TO_MDS:
+		filelayout_reset_write(data);
+		return task->tk_status;
+	case -EAGAIN:
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -260,16 +323,17 @@ static void prepare_to_resend_writes(struct nfs_commit_data *data)
 static int filelayout_commit_done_cb(struct rpc_task *task,
 				     struct nfs_commit_data *data)
 {
-	int reset = 0;
-
-	if (filelayout_async_handle_error(task, data->context->state,
-					  data->ds_clp, &reset) == -EAGAIN) {
-		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
-			__func__, data->ds_clp, data->ds_clp->cl_session);
-		if (reset)
-			prepare_to_resend_writes(data);
-		else
-			rpc_restart_call_prepare(task);
+	int err;
+
+	err = filelayout_async_handle_error(task, NULL, data->ds_clp,
+					    data->lseg);
+
+	switch (err) {
+	case -NFS4ERR_RESET_TO_MDS:
+		prepare_to_resend_writes(data);
+		return -EAGAIN;
+	case -EAGAIN:
+		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 6fb1901..3259be6 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -48,6 +48,9 @@
 #define NFS4_PNFS_MAX_STRIPE_CNT 4096
 #define NFS4_PNFS_MAX_MULTI_CNT  256 /* 256 fit into a u8 stripe_index */
 
+/* error codes for internal use */
+#define NFS4ERR_RESET_TO_MDS   12001
+
 enum stripetype4 {
 	STRIPE_SPARSE = 1,
 	STRIPE_DENSE = 2
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6fdeca2..16cc194 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1175,7 +1175,7 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
 
-static int pnfs_write_done_resend_to_mds(struct inode *inode,
+int pnfs_write_done_resend_to_mds(struct inode *inode,
 				struct list_head *head,
 				const struct nfs_pgio_completion_ops *compl_ops)
 {
@@ -1203,6 +1203,7 @@ static int pnfs_write_done_resend_to_mds(struct inode *inode,
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
 
 static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
 {
@@ -1329,7 +1330,7 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
 
-static int pnfs_read_done_resend_to_mds(struct inode *inode,
+int pnfs_read_done_resend_to_mds(struct inode *inode,
 				struct list_head *head,
 				const struct nfs_pgio_completion_ops *compl_ops)
 {
@@ -1353,6 +1354,7 @@ static int pnfs_read_done_resend_to_mds(struct inode *inode,
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
 
 static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
 {
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f20054b..9cf9ede 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -222,6 +222,10 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
 					       gfp_t gfp_flags);
 
 void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
+int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
+			const struct nfs_pgio_completion_ops *compl_ops);
+int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
+			const struct nfs_pgio_completion_ops *compl_ops);
 
 /* nfs4_deviceid_flags */
 enum {
-- 
cgit v0.10.2


From a033a09189c0125d56f2ac17ffb4bec5a3d3323b Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 27 Apr 2012 17:53:47 -0400
Subject: NFSv4.1 remove nfs4_reset_write and nfs4_reset_read

Replaced by filelayout_reset_write and filelayout_reset_read

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 1466c5d..989959a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -373,13 +373,11 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 
 /* nfs4proc.c */
 extern void __nfs4_read_done_cb(struct nfs_read_data *);
-extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
 extern int nfs4_init_client(struct nfs_client *clp,
 			    const struct rpc_timeout *timeparms,
 			    const char *ip_addr,
 			    rpc_authflavor_t authflavour,
 			    int noresvport);
-extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
 extern int _nfs4_call_sync(struct rpc_clnt *clnt,
 			   struct nfs_server *server,
 			   struct rpc_message *msg,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e6ab15f..49eecd5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3369,23 +3369,6 @@ static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_da
 	rpc_call_start(task);
 }
 
-/* Reset the the nfs_read_data to send the read to the MDS. */
-void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
-{
-	struct nfs_pgio_header *hdr = data->header;
-	struct inode *inode = hdr->inode;
-
-	dprintk("%s Reset task for i/o through\n", __func__);
-	data->ds_clp = NULL;
-	/* offsets will differ in the dense stripe case */
-	data->args.offset = data->mds_offset;
-	data->args.fh     = NFS_FH(inode);
-	data->read_done_cb = nfs4_read_done_cb;
-	task->tk_ops = hdr->mds_ops;
-	rpc_task_reset_client(task, NFS_CLIENT(inode));
-}
-EXPORT_SYMBOL_GPL(nfs4_reset_read);
-
 static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->header->inode;
@@ -3409,24 +3392,6 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 		nfs4_write_done_cb(task, data);
 }
 
-/* Reset the the nfs_write_data to send the write to the MDS. */
-void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
-{
-	struct nfs_pgio_header *hdr = data->header;
-	struct inode *inode = hdr->inode;
-
-	dprintk("%s Reset task for i/o through\n", __func__);
-	data->ds_clp     = NULL;
-	data->write_done_cb = nfs4_write_done_cb;
-	data->args.fh       = NFS_FH(inode);
-	data->args.bitmask  = data->res.server->cache_consistency_bitmask;
-	data->args.offset   = data->mds_offset;
-	data->res.fattr     = &data->fattr;
-	task->tk_ops        = hdr->mds_ops;
-	rpc_task_reset_client(task, NFS_CLIENT(inode));
-}
-EXPORT_SYMBOL_GPL(nfs4_reset_write);
-
 static
 bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data)
 {
-- 
cgit v0.10.2


From 0ad2f378e1af7996d6f8355c02181ff3cc7ab260 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 27 Apr 2012 17:53:48 -0400
Subject: NFSv4.1 Check invalid deviceid upon slot table waitq wakeup

Tasks sleeping on the slot table waitq wake to the rpc_prepare_task state.
Reset the task for io through the MDS if the deviceid is invalid.

The reset functions put the io pages through the pageio layer which has the
advantage of re-coalescing which allows for the MDS and DS having different
r/wsizes. Exit the awakened task without executing the rpc_call_done routine.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index f503cbe5..1b9bedb 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -252,7 +252,14 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
 static void filelayout_read_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs_read_data *rdata = data;
+	struct pnfs_layout_segment *lseg = rdata->header->lseg;
 
+	if (filelayout_test_devid_invalid(FILELAYOUT_DEVID_NODE(lseg))) {
+		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+		filelayout_reset_read(rdata);
+		rpc_exit(task, 0);
+		return;
+	}
 	rdata->read_done_cb = filelayout_read_done_cb;
 
 	if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
@@ -269,6 +276,9 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
 
 	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
+	if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags))
+		return;
+
 	/* Note this may cause RPC to be resent */
 	rdata->header->mds_ops->rpc_call_done(task, data);
 }
@@ -343,7 +353,14 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 static void filelayout_write_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs_write_data *wdata = data;
+	struct pnfs_layout_segment *lseg = wdata->header->lseg;
 
+	if (filelayout_test_devid_invalid(FILELAYOUT_DEVID_NODE(lseg))) {
+		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+		filelayout_reset_write(wdata);
+		rpc_exit(task, 0);
+		return;
+	}
 	if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
 				&wdata->args.seq_args, &wdata->res.seq_res,
 				task))
@@ -356,6 +373,9 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data)
 {
 	struct nfs_write_data *wdata = data;
 
+	if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags))
+		return;
+
 	/* Note this may cause RPC to be resent */
 	wdata->header->mds_ops->rpc_call_done(task, data);
 }
-- 
cgit v0.10.2


From 671fb89695fee0c70a969371efd38ed30be76a8a Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 27 Apr 2012 17:53:49 -0400
Subject: NFSv4.1 wake up all tasks on un-connected DS slot table waitq

The DS has a connection error (invalid deviceid). Drain the fore channel
slot table waitq.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 1b9bedb..a63062d 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -133,6 +133,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 	struct nfs_server *mds_server = NFS_SERVER(inode);
 	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
 	struct nfs_client *mds_client = mds_server->nfs_client;
+	struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
 
 	if (task->tk_status >= 0)
 		return 0;
@@ -186,6 +187,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 		dprintk("%s DS connection error %d\n", __func__,
 			task->tk_status);
 		filelayout_mark_devid_invalid(devid);
+		rpc_wake_up(&tbl->slot_tbl_waitq);
 		/* fall through */
 	default:
 		dprintk("%s Retry through MDS. Error %d\n", __func__,
-- 
cgit v0.10.2


From 0a57cdac3fb9d249f4fbbc745c01b9292ef8c1b7 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 27 Apr 2012 17:53:50 -0400
Subject: NFSv4.1 send layoutreturn to fence disconnected data server

Let the MDS know that you are redirecting I/O from pNFS to MDS.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index a63062d..c6b7c18 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -186,6 +186,8 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 	case -EPIPE:
 		dprintk("%s DS connection error %d\n", __func__,
 			task->tk_status);
+		if (!filelayout_test_devid_invalid(devid))
+			_pnfs_return_layout(state->inode);
 		filelayout_mark_devid_invalid(devid);
 		rpc_wake_up(&tbl->slot_tbl_waitq);
 		/* fall through */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 16cc194..e48017f 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -692,6 +692,7 @@ out:
 	dprintk("<-- %s status: %d\n", __func__, status);
 	return status;
 }
+EXPORT_SYMBOL_GPL(_pnfs_return_layout);
 
 bool pnfs_roc(struct inode *ino)
 {
-- 
cgit v0.10.2


From 3a7936c3fc469c196d9163abfea6b7aa9572d443 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 27 Apr 2012 17:53:51 -0400
Subject: NFSv4.1 ref count nfs_client across filelayout data server io

Prepare to put a dis-connected DS client record.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index c6b7c18..eb8eb00 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -101,6 +101,9 @@ static void filelayout_reset_write(struct nfs_write_data *data)
 							&hdr->pages,
 							hdr->completion_ops);
 	}
+	/* balance nfs_get_client in filelayout_write_pagelist */
+	nfs_put_client(data->ds_clp);
+	data->ds_clp     = NULL;
 }
 
 static void filelayout_reset_read(struct nfs_read_data *data)
@@ -122,6 +125,9 @@ static void filelayout_reset_read(struct nfs_read_data *data)
 							&hdr->pages,
 							hdr->completion_ops);
 	}
+	/* balance nfs_get_client in filelayout_read_pagelist */
+	nfs_put_client(data->ds_clp);
+	data->ds_clp = NULL;
 }
 
 static int filelayout_async_handle_error(struct rpc_task *task,
@@ -298,6 +304,8 @@ static void filelayout_read_release(void *data)
 {
 	struct nfs_read_data *rdata = data;
 
+	if (!test_bit(NFS_IOHDR_REDO, &rdata->header->flags))
+		nfs_put_client(rdata->ds_clp);
 	rdata->header->mds_ops->rpc_release(data);
 }
 
@@ -395,6 +403,8 @@ static void filelayout_write_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
 
+	if (!test_bit(NFS_IOHDR_REDO, &wdata->header->flags))
+		nfs_put_client(wdata->ds_clp);
 	wdata->header->mds_ops->rpc_release(data);
 }
 
@@ -431,6 +441,7 @@ static void filelayout_commit_release(void *calldata)
 
 	data->completion_ops->completion(data);
 	put_lseg(data->lseg);
+	nfs_put_client(data->ds_clp);
 	nfs_commitdata_release(data);
 }
 
@@ -476,9 +487,11 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 	ds = nfs4_fl_prepare_ds(lseg, idx);
 	if (!ds)
 		return PNFS_NOT_ATTEMPTED;
-	dprintk("%s USE DS: %s\n", __func__, ds->ds_remotestr);
+	dprintk("%s USE DS: %s cl_count %d\n", __func__,
+		ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
 
 	/* No multipath support. Use first DS */
+	atomic_inc(&ds->ds_clp->cl_count);
 	data->ds_clp = ds->ds_clp;
 	fh = nfs4_fl_select_ds_fh(lseg, j);
 	if (fh)
@@ -512,11 +525,12 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
 	ds = nfs4_fl_prepare_ds(lseg, idx);
 	if (!ds)
 		return PNFS_NOT_ATTEMPTED;
-	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__,
-		hdr->inode->i_ino, sync, (size_t) data->args.count, offset,
-		ds->ds_remotestr);
+	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
+		__func__, hdr->inode->i_ino, sync, (size_t) data->args.count,
+		offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
 
 	data->write_done_cb = filelayout_write_done_cb;
+	atomic_inc(&ds->ds_clp->cl_count);
 	data->ds_clp = ds->ds_clp;
 	fh = nfs4_fl_select_ds_fh(lseg, j);
 	if (fh)
@@ -1048,8 +1062,10 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
 		filelayout_commit_release(data);
 		return -EAGAIN;
 	}
-	dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how);
+	dprintk("%s ino %lu, how %d cl_count %d\n", __func__,
+		data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count));
 	data->commit_done_cb = filelayout_commit_done_cb;
+	atomic_inc(&ds->ds_clp->cl_count);
 	data->ds_clp = ds->ds_clp;
 	fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
 	if (fh)
-- 
cgit v0.10.2


From b4a2967e52523dbf0281b52c042f9042c6082f99 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 27 Apr 2012 17:53:52 -0400
Subject: NFSv4.1 dereference a disconnected data server client record

When the last DS io is processed, the data server client record will be
freed.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index eb8eb00..eaaca89 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -196,6 +196,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 			_pnfs_return_layout(state->inode);
 		filelayout_mark_devid_invalid(devid);
 		rpc_wake_up(&tbl->slot_tbl_waitq);
+		nfs4_ds_disconnect(clp);
 		/* fall through */
 	default:
 		dprintk("%s Retry through MDS. Error %d\n", __func__,
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 3259be6..95562ad 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -146,5 +146,6 @@ extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 struct nfs4_file_layout_dsaddr *
 get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
+void nfs4_ds_disconnect(struct nfs_client *clp);
 
 #endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index d4d2032..bf49b78 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -149,6 +149,28 @@ _data_server_lookup_locked(const struct list_head *dsaddrs)
 }
 
 /*
+ * Lookup DS by nfs_client pointer. Zero data server client pointer
+ */
+void nfs4_ds_disconnect(struct nfs_client *clp)
+{
+	struct nfs4_pnfs_ds *ds;
+	struct nfs_client *found = NULL;
+
+	dprintk("%s clp %p\n", __func__, clp);
+	spin_lock(&nfs4_ds_cache_lock);
+	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
+		if (ds->ds_clp && ds->ds_clp == clp) {
+			found = ds->ds_clp;
+			ds->ds_clp = NULL;
+		}
+	spin_unlock(&nfs4_ds_cache_lock);
+	if (found) {
+		set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
+		nfs_put_client(clp);
+	}
+}
+
+/*
  * Create an rpc connection to the nfs4_pnfs_ds data server
  * Currently only supports IPv4 and IPv6 addresses
  */
-- 
cgit v0.10.2


From 041245c88a29273788e8eff1353bc6e1f56c61df Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 27 Apr 2012 17:53:53 -0400
Subject: NFSv4.1 resend LAYOUTGET on data server invalid layout errors

The "invalid layout" class of errors is handled by destroying the layout and
getting a new layout from the server.  Currently, the layout must be
destroyed before a new layout can be obtained.

This means that all references (e.g.lsegs) to the "to be destroyed" layout
header must be dropped before it can be destroyed. This in turn means waiting
for all in flight RPC's using the old layout as well as draining the data
server session slot table wait queue.

Set the NFS_LAYOUT_INVALID flag to redirect I/O to the MDS while waiting for
the old layout to be destroyed.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index eaaca89..474c630 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -182,6 +182,27 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 		break;
 	case -NFS4ERR_RETRY_UNCACHED_REP:
 		break;
+	/* Invalidate Layout errors */
+	case -NFS4ERR_PNFS_NO_LAYOUT:
+	case -ESTALE:           /* mapped NFS4ERR_STALE */
+	case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
+	case -EISDIR:           /* mapped NFS4ERR_ISDIR */
+	case -NFS4ERR_FHEXPIRED:
+	case -NFS4ERR_WRONG_TYPE:
+		dprintk("%s Invalid layout error %d\n", __func__,
+			task->tk_status);
+		/*
+		 * Destroy layout so new i/o will get a new layout.
+		 * Layout will not be destroyed until all current lseg
+		 * references are put. Mark layout as invalid to resend failed
+		 * i/o and all i/o waiting on the slot table to the MDS until
+		 * layout is destroyed and a new valid layout is obtained.
+		 */
+		set_bit(NFS_LAYOUT_INVALID,
+				&NFS_I(state->inode)->layout->plh_flags);
+		pnfs_destroy_layout(NFS_I(state->inode));
+		rpc_wake_up(&tbl->slot_tbl_waitq);
+		goto reset;
 	/* RPC connection errors */
 	case -ECONNREFUSED:
 	case -EHOSTDOWN:
@@ -199,6 +220,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 		nfs4_ds_disconnect(clp);
 		/* fall through */
 	default:
+reset:
 		dprintk("%s Retry through MDS. Error %d\n", __func__,
 			task->tk_status);
 		return -NFS4ERR_RESET_TO_MDS;
@@ -263,9 +285,8 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
 static void filelayout_read_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs_read_data *rdata = data;
-	struct pnfs_layout_segment *lseg = rdata->header->lseg;
 
-	if (filelayout_test_devid_invalid(FILELAYOUT_DEVID_NODE(lseg))) {
+	if (filelayout_reset_to_mds(rdata->header->lseg)) {
 		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
 		filelayout_reset_read(rdata);
 		rpc_exit(task, 0);
@@ -366,9 +387,8 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 static void filelayout_write_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs_write_data *wdata = data;
-	struct pnfs_layout_segment *lseg = wdata->header->lseg;
 
-	if (filelayout_test_devid_invalid(FILELAYOUT_DEVID_NODE(lseg))) {
+	if (filelayout_reset_to_mds(wdata->header->lseg)) {
 		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
 		filelayout_reset_write(wdata);
 		rpc_exit(task, 0);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 95562ad..43fe802 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -129,11 +129,24 @@ filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
 }
 
 static inline bool
+filelayout_test_layout_invalid(struct pnfs_layout_hdr *lo)
+{
+	return test_bit(NFS_LAYOUT_INVALID, &lo->plh_flags);
+}
+
+static inline bool
 filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
 {
 	return test_bit(NFS_DEVICEID_INVALID, &node->flags);
 }
 
+static inline bool
+filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
+{
+	return filelayout_test_devid_invalid(FILELAYOUT_DEVID_NODE(lseg)) ||
+		filelayout_test_layout_invalid(lseg->pls_layout);
+}
+
 extern struct nfs_fh *
 nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e48017f..5d09a36 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -455,6 +455,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
 	spin_unlock(&nfsi->vfs_inode.i_lock);
 	pnfs_free_lseg_list(&tmp_list);
 }
+EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
 
 /*
  * Called by the state manger to remove all layouts established under an
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9cf9ede..7980756 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -63,6 +63,7 @@ enum {
 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
 	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */
 	NFS_LAYOUT_DESTROYED,		/* no new use of layout allowed */
+	NFS_LAYOUT_INVALID,		/* layout is being destroyed */
 };
 
 enum layoutdriver_policy_flags {
-- 
cgit v0.10.2


From 3f7e82759c692df473675ed06fb90b20f1f225c3 Mon Sep 17 00:00:00 2001
From: Rhyland Klein <rklein@nvidia.com>
Date: Tue, 8 May 2012 11:42:38 -0700
Subject: mfd: Commonize tps65910 regmap access through header

This change removes the read/write callback functions in favor of common
regmap accessors inside the header file. This change also makes use of
regmap_read/write for single register access which maps better onto what this
driver actually needs.

Signed-off-by: Rhyland Klein <rklein@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/gpio/gpio-tps65910.c b/drivers/gpio/gpio-tps65910.c
index 7eef648..bc155f2 100644
--- a/drivers/gpio/gpio-tps65910.c
+++ b/drivers/gpio/gpio-tps65910.c
@@ -23,9 +23,9 @@
 static int tps65910_gpio_get(struct gpio_chip *gc, unsigned offset)
 {
 	struct tps65910 *tps65910 = container_of(gc, struct tps65910, gpio);
-	uint8_t val;
+	unsigned int val;
 
-	tps65910->read(tps65910, TPS65910_GPIO0 + offset, 1, &val);
+	tps65910_reg_read(tps65910, TPS65910_GPIO0 + offset, &val);
 
 	if (val & GPIO_STS_MASK)
 		return 1;
@@ -39,10 +39,10 @@ static void tps65910_gpio_set(struct gpio_chip *gc, unsigned offset,
 	struct tps65910 *tps65910 = container_of(gc, struct tps65910, gpio);
 
 	if (value)
-		tps65910_set_bits(tps65910, TPS65910_GPIO0 + offset,
+		tps65910_reg_set_bits(tps65910, TPS65910_GPIO0 + offset,
 						GPIO_SET_MASK);
 	else
-		tps65910_clear_bits(tps65910, TPS65910_GPIO0 + offset,
+		tps65910_reg_clear_bits(tps65910, TPS65910_GPIO0 + offset,
 						GPIO_SET_MASK);
 }
 
@@ -54,7 +54,7 @@ static int tps65910_gpio_output(struct gpio_chip *gc, unsigned offset,
 	/* Set the initial value */
 	tps65910_gpio_set(gc, offset, value);
 
-	return tps65910_set_bits(tps65910, TPS65910_GPIO0 + offset,
+	return tps65910_reg_set_bits(tps65910, TPS65910_GPIO0 + offset,
 						GPIO_CFG_MASK);
 }
 
@@ -62,7 +62,7 @@ static int tps65910_gpio_input(struct gpio_chip *gc, unsigned offset)
 {
 	struct tps65910 *tps65910 = container_of(gc, struct tps65910, gpio);
 
-	return tps65910_clear_bits(tps65910, TPS65910_GPIO0 + offset,
+	return tps65910_reg_clear_bits(tps65910, TPS65910_GPIO0 + offset,
 						GPIO_CFG_MASK);
 }
 
@@ -102,7 +102,7 @@ void tps65910_gpio_init(struct tps65910 *tps65910, int gpio_base)
 		int i;
 		for (i = 0; i < tps65910->gpio.ngpio; ++i) {
 			if (board_data->en_gpio_sleep[i]) {
-				ret = tps65910_set_bits(tps65910,
+				ret = tps65910_reg_set_bits(tps65910,
 					TPS65910_GPIO0 + i, GPIO_SLEEP_MASK);
 				if (ret < 0)
 					dev_warn(tps65910->dev,
diff --git a/drivers/mfd/tps65910-irq.c b/drivers/mfd/tps65910-irq.c
index c9ed5c0..0f1ff7f 100644
--- a/drivers/mfd/tps65910-irq.c
+++ b/drivers/mfd/tps65910-irq.c
@@ -41,28 +41,28 @@ static inline int irq_to_tps65910_irq(struct tps65910 *tps65910,
 static irqreturn_t tps65910_irq(int irq, void *irq_data)
 {
 	struct tps65910 *tps65910 = irq_data;
+	unsigned int reg;
 	u32 irq_sts;
 	u32 irq_mask;
-	u8 reg;
 	int i;
 
-	tps65910->read(tps65910, TPS65910_INT_STS, 1, &reg);
+	tps65910_reg_read(tps65910, TPS65910_INT_STS, &reg);
 	irq_sts = reg;
-	tps65910->read(tps65910, TPS65910_INT_STS2, 1, &reg);
+	tps65910_reg_read(tps65910, TPS65910_INT_STS2, &reg);
 	irq_sts |= reg << 8;
 	switch (tps65910_chip_id(tps65910)) {
 	case TPS65911:
-		tps65910->read(tps65910, TPS65910_INT_STS3, 1, &reg);
+		tps65910_reg_read(tps65910, TPS65910_INT_STS3, &reg);
 		irq_sts |= reg << 16;
 	}
 
-	tps65910->read(tps65910, TPS65910_INT_MSK, 1, &reg);
+	tps65910_reg_read(tps65910, TPS65910_INT_MSK, &reg);
 	irq_mask = reg;
-	tps65910->read(tps65910, TPS65910_INT_MSK2, 1, &reg);
+	tps65910_reg_read(tps65910, TPS65910_INT_MSK2, &reg);
 	irq_mask |= reg << 8;
 	switch (tps65910_chip_id(tps65910)) {
 	case TPS65911:
-		tps65910->read(tps65910, TPS65910_INT_MSK3, 1, &reg);
+		tps65910_reg_read(tps65910, TPS65910_INT_MSK3, &reg);
 		irq_mask |= reg << 16;
 	}
 
@@ -82,13 +82,13 @@ static irqreturn_t tps65910_irq(int irq, void *irq_data)
 	/* Write the STS register back to clear IRQs we handled */
 	reg = irq_sts & 0xFF;
 	irq_sts >>= 8;
-	tps65910->write(tps65910, TPS65910_INT_STS, 1, &reg);
+	tps65910_reg_write(tps65910, TPS65910_INT_STS, reg);
 	reg = irq_sts & 0xFF;
-	tps65910->write(tps65910, TPS65910_INT_STS2, 1, &reg);
+	tps65910_reg_write(tps65910, TPS65910_INT_STS2, reg);
 	switch (tps65910_chip_id(tps65910)) {
 	case TPS65911:
 		reg = irq_sts >> 8;
-		tps65910->write(tps65910, TPS65910_INT_STS3, 1, &reg);
+		tps65910_reg_write(tps65910, TPS65910_INT_STS3, reg);
 	}
 
 	return IRQ_HANDLED;
@@ -105,27 +105,27 @@ static void tps65910_irq_sync_unlock(struct irq_data *data)
 {
 	struct tps65910 *tps65910 = irq_data_get_irq_chip_data(data);
 	u32 reg_mask;
-	u8 reg;
+	unsigned int reg;
 
-	tps65910->read(tps65910, TPS65910_INT_MSK, 1, &reg);
+	tps65910_reg_read(tps65910, TPS65910_INT_MSK, &reg);
 	reg_mask = reg;
-	tps65910->read(tps65910, TPS65910_INT_MSK2, 1, &reg);
+	tps65910_reg_read(tps65910, TPS65910_INT_MSK2, &reg);
 	reg_mask |= reg << 8;
 	switch (tps65910_chip_id(tps65910)) {
 	case TPS65911:
-		tps65910->read(tps65910, TPS65910_INT_MSK3, 1, &reg);
+		tps65910_reg_read(tps65910, TPS65910_INT_MSK3, &reg);
 		reg_mask |= reg << 16;
 	}
 
 	if (tps65910->irq_mask != reg_mask) {
 		reg = tps65910->irq_mask & 0xFF;
-		tps65910->write(tps65910, TPS65910_INT_MSK, 1, &reg);
+		tps65910_reg_write(tps65910, TPS65910_INT_MSK, reg);
 		reg = tps65910->irq_mask >> 8 & 0xFF;
-		tps65910->write(tps65910, TPS65910_INT_MSK2, 1, &reg);
+		tps65910_reg_write(tps65910, TPS65910_INT_MSK2, reg);
 		switch (tps65910_chip_id(tps65910)) {
 		case TPS65911:
 			reg = tps65910->irq_mask >> 16;
-			tps65910->write(tps65910, TPS65910_INT_MSK3, 1, &reg);
+			tps65910_reg_write(tps65910, TPS65910_INT_MSK3, reg);
 		}
 	}
 	mutex_unlock(&tps65910->irq_lock);
diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c
index 7a55af9..7dffbe1 100644
--- a/drivers/mfd/tps65910.c
+++ b/drivers/mfd/tps65910.c
@@ -37,30 +37,6 @@ static struct mfd_cell tps65910s[] = {
 };
 
 
-static int tps65910_i2c_read(struct tps65910 *tps65910, u8 reg,
-				  int bytes, void *dest)
-{
-	return regmap_bulk_read(tps65910->regmap, reg, dest, bytes);
-}
-
-static int tps65910_i2c_write(struct tps65910 *tps65910, u8 reg,
-				  int bytes, void *src)
-{
-	return regmap_bulk_write(tps65910->regmap, reg, src, bytes);
-}
-
-int tps65910_set_bits(struct tps65910 *tps65910, u8 reg, u8 mask)
-{
-	return regmap_update_bits(tps65910->regmap, reg, mask, mask);
-}
-EXPORT_SYMBOL_GPL(tps65910_set_bits);
-
-int tps65910_clear_bits(struct tps65910 *tps65910, u8 reg, u8 mask)
-{
-	return regmap_update_bits(tps65910->regmap, reg, mask, 0);
-}
-EXPORT_SYMBOL_GPL(tps65910_clear_bits);
-
 static bool is_volatile_reg(struct device *dev, unsigned int reg)
 {
 	struct tps65910 *tps65910 = dev_get_drvdata(dev);
@@ -102,7 +78,7 @@ static int __devinit tps65910_sleepinit(struct tps65910 *tps65910,
 		return 0;
 
 	/* enabling SLEEP device state */
-	ret = tps65910_set_bits(tps65910, TPS65910_DEVCTRL,
+	ret = tps65910_reg_set_bits(tps65910, TPS65910_DEVCTRL,
 				DEVCTRL_DEV_SLP_MASK);
 	if (ret < 0) {
 		dev_err(dev, "set dev_slp failed: %d\n", ret);
@@ -114,7 +90,8 @@ static int __devinit tps65910_sleepinit(struct tps65910 *tps65910,
 		return 0;
 
 	if (pmic_pdata->slp_keepon->therm_keepon) {
-		ret = tps65910_set_bits(tps65910, TPS65910_SLEEP_KEEP_RES_ON,
+		ret = tps65910_reg_set_bits(tps65910,
+				TPS65910_SLEEP_KEEP_RES_ON,
 				SLEEP_KEEP_RES_ON_THERM_KEEPON_MASK);
 		if (ret < 0) {
 			dev_err(dev, "set therm_keepon failed: %d\n", ret);
@@ -123,7 +100,8 @@ static int __devinit tps65910_sleepinit(struct tps65910 *tps65910,
 	}
 
 	if (pmic_pdata->slp_keepon->clkout32k_keepon) {
-		ret = tps65910_set_bits(tps65910, TPS65910_SLEEP_KEEP_RES_ON,
+		ret = tps65910_reg_set_bits(tps65910,
+				TPS65910_SLEEP_KEEP_RES_ON,
 				SLEEP_KEEP_RES_ON_CLKOUT32K_KEEPON_MASK);
 		if (ret < 0) {
 			dev_err(dev, "set clkout32k_keepon failed: %d\n", ret);
@@ -132,7 +110,8 @@ static int __devinit tps65910_sleepinit(struct tps65910 *tps65910,
 	}
 
 	if (pmic_pdata->slp_keepon->i2chs_keepon) {
-		ret = tps65910_set_bits(tps65910, TPS65910_SLEEP_KEEP_RES_ON,
+		ret = tps65910_reg_set_bits(tps65910,
+				TPS65910_SLEEP_KEEP_RES_ON,
 				SLEEP_KEEP_RES_ON_I2CHS_KEEPON_MASK);
 		if (ret < 0) {
 			dev_err(dev, "set i2chs_keepon failed: %d\n", ret);
@@ -143,7 +122,8 @@ static int __devinit tps65910_sleepinit(struct tps65910 *tps65910,
 	return 0;
 
 disable_dev_slp:
-	tps65910_clear_bits(tps65910, TPS65910_DEVCTRL, DEVCTRL_DEV_SLP_MASK);
+	tps65910_reg_clear_bits(tps65910, TPS65910_DEVCTRL,
+				DEVCTRL_DEV_SLP_MASK);
 
 err_sleep_init:
 	return ret;
@@ -176,8 +156,6 @@ static __devinit int tps65910_i2c_probe(struct i2c_client *i2c,
 	tps65910->dev = &i2c->dev;
 	tps65910->i2c_client = i2c;
 	tps65910->id = id->driver_data;
-	tps65910->read = tps65910_i2c_read;
-	tps65910->write = tps65910_i2c_write;
 	mutex_init(&tps65910->io_mutex);
 
 	tps65910->regmap = regmap_init_i2c(i2c, &tps65910_regmap_config);
diff --git a/drivers/regulator/tps65910-regulator.c b/drivers/regulator/tps65910-regulator.c
index 4a37c2b6..852b05b 100644
--- a/drivers/regulator/tps65910-regulator.c
+++ b/drivers/regulator/tps65910-regulator.c
@@ -331,21 +331,16 @@ struct tps65910_reg {
 
 static inline int tps65910_read(struct tps65910_reg *pmic, u8 reg)
 {
-	u8 val;
+	unsigned int val;
 	int err;
 
-	err = pmic->mfd->read(pmic->mfd, reg, 1, &val);
+	err = tps65910_reg_read(pmic->mfd, reg, &val);
 	if (err)
 		return err;
 
 	return val;
 }
 
-static inline int tps65910_write(struct tps65910_reg *pmic, u8 reg, u8 val)
-{
-	return pmic->mfd->write(pmic->mfd, reg, 1, &val);
-}
-
 static int tps65910_modify_bits(struct tps65910_reg *pmic, u8 reg,
 					u8 set_mask, u8 clear_mask)
 {
@@ -362,7 +357,7 @@ static int tps65910_modify_bits(struct tps65910_reg *pmic, u8 reg,
 
 	data &= ~clear_mask;
 	data |= set_mask;
-	err = tps65910_write(pmic, reg, data);
+	err = tps65910_reg_write(pmic->mfd, reg, data);
 	if (err)
 		dev_err(pmic->mfd->dev, "Write for reg 0x%x failed\n", reg);
 
@@ -371,7 +366,7 @@ out:
 	return err;
 }
 
-static int tps65910_reg_read(struct tps65910_reg *pmic, u8 reg)
+static int tps65910_reg_read_locked(struct tps65910_reg *pmic, u8 reg)
 {
 	int data;
 
@@ -385,13 +380,13 @@ static int tps65910_reg_read(struct tps65910_reg *pmic, u8 reg)
 	return data;
 }
 
-static int tps65910_reg_write(struct tps65910_reg *pmic, u8 reg, u8 val)
+static int tps65910_reg_write_locked(struct tps65910_reg *pmic, u8 reg, u8 val)
 {
 	int err;
 
 	mutex_lock(&pmic->mutex);
 
-	err = tps65910_write(pmic, reg, val);
+	err = tps65910_reg_write(pmic->mfd, reg, val);
 	if (err < 0)
 		dev_err(pmic->mfd->dev, "Write for reg 0x%x failed\n", reg);
 
@@ -476,7 +471,7 @@ static int tps65910_is_enabled(struct regulator_dev *dev)
 	if (reg < 0)
 		return reg;
 
-	value = tps65910_reg_read(pmic, reg);
+	value = tps65910_reg_read_locked(pmic, reg);
 	if (value < 0)
 		return value;
 
@@ -493,7 +488,7 @@ static int tps65910_enable(struct regulator_dev *dev)
 	if (reg < 0)
 		return reg;
 
-	return tps65910_set_bits(mfd, reg, TPS65910_SUPPLY_STATE_ENABLED);
+	return tps65910_reg_set_bits(mfd, reg, TPS65910_SUPPLY_STATE_ENABLED);
 }
 
 static int tps65910_disable(struct regulator_dev *dev)
@@ -506,7 +501,7 @@ static int tps65910_disable(struct regulator_dev *dev)
 	if (reg < 0)
 		return reg;
 
-	return tps65910_clear_bits(mfd, reg, TPS65910_SUPPLY_STATE_ENABLED);
+	return tps65910_reg_clear_bits(mfd, reg, TPS65910_SUPPLY_STATE_ENABLED);
 }
 
 static int tps65910_enable_time(struct regulator_dev *dev)
@@ -532,9 +527,9 @@ static int tps65910_set_mode(struct regulator_dev *dev, unsigned int mode)
 							LDO_ST_MODE_BIT);
 	case REGULATOR_MODE_IDLE:
 		value = LDO_ST_ON_BIT | LDO_ST_MODE_BIT;
-		return tps65910_set_bits(mfd, reg, value);
+		return tps65910_reg_set_bits(mfd, reg, value);
 	case REGULATOR_MODE_STANDBY:
-		return tps65910_clear_bits(mfd, reg, LDO_ST_ON_BIT);
+		return tps65910_reg_clear_bits(mfd, reg, LDO_ST_ON_BIT);
 	}
 
 	return -EINVAL;
@@ -549,7 +544,7 @@ static unsigned int tps65910_get_mode(struct regulator_dev *dev)
 	if (reg < 0)
 		return reg;
 
-	value = tps65910_reg_read(pmic, reg);
+	value = tps65910_reg_read_locked(pmic, reg);
 	if (value < 0)
 		return value;
 
@@ -569,28 +564,28 @@ static int tps65910_get_voltage_dcdc_sel(struct regulator_dev *dev)
 
 	switch (id) {
 	case TPS65910_REG_VDD1:
-		opvsel = tps65910_reg_read(pmic, TPS65910_VDD1_OP);
-		mult = tps65910_reg_read(pmic, TPS65910_VDD1);
+		opvsel = tps65910_reg_read_locked(pmic, TPS65910_VDD1_OP);
+		mult = tps65910_reg_read_locked(pmic, TPS65910_VDD1);
 		mult = (mult & VDD1_VGAIN_SEL_MASK) >> VDD1_VGAIN_SEL_SHIFT;
-		srvsel = tps65910_reg_read(pmic, TPS65910_VDD1_SR);
+		srvsel = tps65910_reg_read_locked(pmic, TPS65910_VDD1_SR);
 		sr = opvsel & VDD1_OP_CMD_MASK;
 		opvsel &= VDD1_OP_SEL_MASK;
 		srvsel &= VDD1_SR_SEL_MASK;
 		vselmax = 75;
 		break;
 	case TPS65910_REG_VDD2:
-		opvsel = tps65910_reg_read(pmic, TPS65910_VDD2_OP);
-		mult = tps65910_reg_read(pmic, TPS65910_VDD2);
+		opvsel = tps65910_reg_read_locked(pmic, TPS65910_VDD2_OP);
+		mult = tps65910_reg_read_locked(pmic, TPS65910_VDD2);
 		mult = (mult & VDD2_VGAIN_SEL_MASK) >> VDD2_VGAIN_SEL_SHIFT;
-		srvsel = tps65910_reg_read(pmic, TPS65910_VDD2_SR);
+		srvsel = tps65910_reg_read_locked(pmic, TPS65910_VDD2_SR);
 		sr = opvsel & VDD2_OP_CMD_MASK;
 		opvsel &= VDD2_OP_SEL_MASK;
 		srvsel &= VDD2_SR_SEL_MASK;
 		vselmax = 75;
 		break;
 	case TPS65911_REG_VDDCTRL:
-		opvsel = tps65910_reg_read(pmic, TPS65911_VDDCTRL_OP);
-		srvsel = tps65910_reg_read(pmic, TPS65911_VDDCTRL_SR);
+		opvsel = tps65910_reg_read_locked(pmic, TPS65911_VDDCTRL_OP);
+		srvsel = tps65910_reg_read_locked(pmic, TPS65911_VDDCTRL_SR);
 		sr = opvsel & VDDCTRL_OP_CMD_MASK;
 		opvsel &= VDDCTRL_OP_SEL_MASK;
 		srvsel &= VDDCTRL_SR_SEL_MASK;
@@ -630,7 +625,7 @@ static int tps65910_get_voltage(struct regulator_dev *dev)
 	if (reg < 0)
 		return reg;
 
-	value = tps65910_reg_read(pmic, reg);
+	value = tps65910_reg_read_locked(pmic, reg);
 	if (value < 0)
 		return value;
 
@@ -669,7 +664,7 @@ static int tps65911_get_voltage(struct regulator_dev *dev)
 
 	reg = pmic->get_ctrl_reg(id);
 
-	value = tps65910_reg_read(pmic, reg);
+	value = tps65910_reg_read_locked(pmic, reg);
 
 	switch (id) {
 	case TPS65911_REG_LDO1:
@@ -728,7 +723,7 @@ static int tps65910_set_voltage_dcdc_sel(struct regulator_dev *dev,
 		tps65910_modify_bits(pmic, TPS65910_VDD1,
 				(dcdc_mult << VDD1_VGAIN_SEL_SHIFT),
 						VDD1_VGAIN_SEL_MASK);
-		tps65910_reg_write(pmic, TPS65910_VDD1_OP, vsel);
+		tps65910_reg_write_locked(pmic, TPS65910_VDD1_OP, vsel);
 		break;
 	case TPS65910_REG_VDD2:
 		dcdc_mult = (selector / VDD1_2_NUM_VOLT_FINE) + 1;
@@ -739,11 +734,11 @@ static int tps65910_set_voltage_dcdc_sel(struct regulator_dev *dev,
 		tps65910_modify_bits(pmic, TPS65910_VDD2,
 				(dcdc_mult << VDD2_VGAIN_SEL_SHIFT),
 						VDD1_VGAIN_SEL_MASK);
-		tps65910_reg_write(pmic, TPS65910_VDD2_OP, vsel);
+		tps65910_reg_write_locked(pmic, TPS65910_VDD2_OP, vsel);
 		break;
 	case TPS65911_REG_VDDCTRL:
 		vsel = selector + 3;
-		tps65910_reg_write(pmic, TPS65911_VDDCTRL_OP, vsel);
+		tps65910_reg_write_locked(pmic, TPS65911_VDDCTRL_OP, vsel);
 	}
 
 	return 0;
@@ -994,10 +989,10 @@ static int tps65910_set_ext_sleep_config(struct tps65910_reg *pmic,
 
 	/* External EN1 control */
 	if (ext_sleep_config & TPS65910_SLEEP_CONTROL_EXT_INPUT_EN1)
-		ret = tps65910_set_bits(mfd,
+		ret = tps65910_reg_set_bits(mfd,
 				TPS65910_EN1_LDO_ASS + regoffs, bit_pos);
 	else
-		ret = tps65910_clear_bits(mfd,
+		ret = tps65910_reg_clear_bits(mfd,
 				TPS65910_EN1_LDO_ASS + regoffs, bit_pos);
 	if (ret < 0) {
 		dev_err(mfd->dev,
@@ -1007,10 +1002,10 @@ static int tps65910_set_ext_sleep_config(struct tps65910_reg *pmic,
 
 	/* External EN2 control */
 	if (ext_sleep_config & TPS65910_SLEEP_CONTROL_EXT_INPUT_EN2)
-		ret = tps65910_set_bits(mfd,
+		ret = tps65910_reg_set_bits(mfd,
 				TPS65910_EN2_LDO_ASS + regoffs, bit_pos);
 	else
-		ret = tps65910_clear_bits(mfd,
+		ret = tps65910_reg_clear_bits(mfd,
 				TPS65910_EN2_LDO_ASS + regoffs, bit_pos);
 	if (ret < 0) {
 		dev_err(mfd->dev,
@@ -1022,10 +1017,10 @@ static int tps65910_set_ext_sleep_config(struct tps65910_reg *pmic,
 	if ((tps65910_chip_id(mfd) == TPS65910) &&
 			(id >= TPS65910_REG_VDIG1)) {
 		if (ext_sleep_config & TPS65910_SLEEP_CONTROL_EXT_INPUT_EN3)
-			ret = tps65910_set_bits(mfd,
+			ret = tps65910_reg_set_bits(mfd,
 				TPS65910_EN3_LDO_ASS + regoffs, bit_pos);
 		else
-			ret = tps65910_clear_bits(mfd,
+			ret = tps65910_reg_clear_bits(mfd,
 				TPS65910_EN3_LDO_ASS + regoffs, bit_pos);
 		if (ret < 0) {
 			dev_err(mfd->dev,
@@ -1037,10 +1032,10 @@ static int tps65910_set_ext_sleep_config(struct tps65910_reg *pmic,
 	/* Return if no external control is selected */
 	if (!(ext_sleep_config & EXT_SLEEP_CONTROL)) {
 		/* Clear all sleep controls */
-		ret = tps65910_clear_bits(mfd,
+		ret = tps65910_reg_clear_bits(mfd,
 			TPS65910_SLEEP_KEEP_LDO_ON + regoffs, bit_pos);
 		if (!ret)
-			ret = tps65910_clear_bits(mfd,
+			ret = tps65910_reg_clear_bits(mfd,
 				TPS65910_SLEEP_SET_LDO_OFF + regoffs, bit_pos);
 		if (ret < 0)
 			dev_err(mfd->dev,
@@ -1059,32 +1054,33 @@ static int tps65910_set_ext_sleep_config(struct tps65910_reg *pmic,
 				(tps65910_chip_id(mfd) == TPS65911))) {
 		int op_reg_add = pmic->get_ctrl_reg(id) + 1;
 		int sr_reg_add = pmic->get_ctrl_reg(id) + 2;
-		int opvsel = tps65910_reg_read(pmic, op_reg_add);
-		int srvsel = tps65910_reg_read(pmic, sr_reg_add);
+		int opvsel = tps65910_reg_read_locked(pmic, op_reg_add);
+		int srvsel = tps65910_reg_read_locked(pmic, sr_reg_add);
 		if (opvsel & VDD1_OP_CMD_MASK) {
 			u8 reg_val = srvsel & VDD1_OP_SEL_MASK;
-			ret = tps65910_reg_write(pmic, op_reg_add, reg_val);
+			ret = tps65910_reg_write_locked(pmic, op_reg_add,
+							reg_val);
 			if (ret < 0) {
 				dev_err(mfd->dev,
 					"Error in configuring op register\n");
 				return ret;
 			}
 		}
-		ret = tps65910_reg_write(pmic, sr_reg_add, 0);
+		ret = tps65910_reg_write_locked(pmic, sr_reg_add, 0);
 		if (ret < 0) {
 			dev_err(mfd->dev, "Error in settting sr register\n");
 			return ret;
 		}
 	}
 
-	ret = tps65910_clear_bits(mfd,
+	ret = tps65910_reg_clear_bits(mfd,
 			TPS65910_SLEEP_KEEP_LDO_ON + regoffs, bit_pos);
 	if (!ret) {
 		if (ext_sleep_config & TPS65911_SLEEP_CONTROL_EXT_INPUT_SLEEP)
-			ret = tps65910_set_bits(mfd,
+			ret = tps65910_reg_set_bits(mfd,
 				TPS65910_SLEEP_SET_LDO_OFF + regoffs, bit_pos);
 		else
-			ret = tps65910_clear_bits(mfd,
+			ret = tps65910_reg_clear_bits(mfd,
 				TPS65910_SLEEP_SET_LDO_OFF + regoffs, bit_pos);
 	}
 	if (ret < 0)
@@ -1117,7 +1113,7 @@ static __devinit int tps65910_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, pmic);
 
 	/* Give control of all register to control port */
-	tps65910_set_bits(pmic->mfd, TPS65910_DEVCTRL,
+	tps65910_reg_set_bits(pmic->mfd, TPS65910_DEVCTRL,
 				DEVCTRL_SR_CTL_I2C_SEL_MASK);
 
 	switch(tps65910_chip_id(tps65910)) {
diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h
index 56903ad..949f1da 100644
--- a/include/linux/mfd/tps65910.h
+++ b/include/linux/mfd/tps65910.h
@@ -18,6 +18,7 @@
 #define __LINUX_MFD_TPS65910_H
 
 #include <linux/gpio.h>
+#include <linux/regmap.h>
 
 /* TPS chip id list */
 #define TPS65910			0
@@ -823,8 +824,6 @@ struct tps65910 {
 	struct regmap *regmap;
 	struct mutex io_mutex;
 	unsigned int id;
-	int (*read)(struct tps65910 *tps65910, u8 reg, int size, void *dest);
-	int (*write)(struct tps65910 *tps65910, u8 reg, int size, void *src);
 
 	/* Client devices */
 	struct tps65910_pmic *pmic;
@@ -847,8 +846,6 @@ struct tps65910_platform_data {
 	int irq_base;
 };
 
-int tps65910_set_bits(struct tps65910 *tps65910, u8 reg, u8 mask);
-int tps65910_clear_bits(struct tps65910 *tps65910, u8 reg, u8 mask);
 void tps65910_gpio_init(struct tps65910 *tps65910, int gpio_base);
 int tps65910_irq_init(struct tps65910 *tps65910, int irq,
 		struct tps65910_platform_data *pdata);
@@ -859,4 +856,28 @@ static inline int tps65910_chip_id(struct tps65910 *tps65910)
 	return tps65910->id;
 }
 
+static inline int tps65910_reg_read(struct tps65910 *tps65910, u8 reg,
+		unsigned int *val)
+{
+	return regmap_read(tps65910->regmap, reg, val);
+}
+
+static inline int tps65910_reg_write(struct tps65910 *tps65910, u8 reg,
+		unsigned int val)
+{
+	return regmap_write(tps65910->regmap, reg, val);
+}
+
+static inline int tps65910_reg_set_bits(struct tps65910 *tps65910, u8 reg,
+		u8 mask)
+{
+	return regmap_update_bits(tps65910->regmap, reg, mask, mask);
+}
+
+static inline int tps65910_reg_clear_bits(struct tps65910 *tps65910, u8 reg,
+		u8 mask)
+{
+	return regmap_update_bits(tps65910->regmap, reg, mask, 0);
+}
+
 #endif /*  __LINUX_MFD_TPS65910_H */
-- 
cgit v0.10.2


From 1291aa457324e149bbda19bd637b4b8ec8f04bcb Mon Sep 17 00:00:00 2001
From: Rhyland Klein <rklein@nvidia.com>
Date: Tue, 8 May 2012 11:42:39 -0700
Subject: mfd: Add tps65910 device tree bindings documentation

Add device tree bindings for TI's tps65910 pmic.

Signed-off-by: Rhyland Klein <rklein@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/Documentation/devicetree/bindings/mfd/tps65910.txt b/Documentation/devicetree/bindings/mfd/tps65910.txt
new file mode 100644
index 0000000..645f5ea
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/tps65910.txt
@@ -0,0 +1,133 @@
+TPS65910 Power Management Integrated Circuit
+
+Required properties:
+- compatible: "ti,tps65910" or "ti,tps65911"
+- reg: I2C slave address
+- interrupts: the interrupt outputs of the controller
+- #gpio-cells: number of cells to describe a GPIO, this should be 2.
+  The first cell is the GPIO number.
+  The second cell is used to specify additional options <unused>.
+- gpio-controller: mark the device as a GPIO controller
+- #interrupt-cells: the number of cells to describe an IRQ, this should be 2.
+  The first cell is the IRQ number.
+  The second cell is the flags, encoded as the trigger masks from
+  Documentation/devicetree/bindings/interrupts.txt
+- regulators: This is the list of child nodes that specify the regulator
+  initialization data for defined regulators. Not all regulators for the given
+  device need to be present. The definition for each of these nodes is defined
+  using the standard binding for regulators found at
+  Documentation/devicetree/bindings/regulator/regulator.txt.
+
+  The valid names for regulators are:
+  tps65910: vrtc, vio, vdd1, vdd2, vdd3, vdig1, vdig2, vpll, vdac, vaux1,
+            vaux2, vaux33, vmmc
+  tps65911: vrtc, vio, vdd1, vdd3, vddctrl, ldo1, ldo2, ldo3, ldo4, ldo5,
+            ldo6, ldo7, ldo8
+
+Optional properties:
+- ti,vmbch-threshold: (tps65911) main battery charged threshold
+  comparator. (see VMBCH_VSEL in TPS65910 datasheet)
+- ti,vmbch2-threshold: (tps65911) main battery discharged threshold
+  comparator. (see VMBCH_VSEL in TPS65910 datasheet)
+- ti,en-gpio-sleep: enable sleep control for gpios
+  There should be 9 entries here, one for each gpio.
+
+Regulator Optional properties:
+- ti,regulator-ext-sleep-control: enable external sleep
+  control through external inputs [0 (not enabled), 1 (EN1), 2 (EN2) or 4(EN3)]
+  If this property is not defined, it defaults to 0 (not enabled).
+
+Example:
+
+	pmu: tps65910@d2 {
+		compatible = "ti,tps65910";
+		reg = <0xd2>;
+		interrupt-parent = <&intc>;
+		interrupts = < 0 118 0x04 >;
+
+		#gpio-cells = <2>;
+		gpio-controller;
+
+		#interrupt-cells = <2>;
+		interrupt-controller;
+
+		ti,vmbch-threshold = 0;
+		ti,vmbch2-threshold = 0;
+
+		ti,en-gpio-sleep = <0 0 1 0 0 0 0 0 0>;
+
+		regulators {
+			vdd1_reg: vdd1 {
+				regulator-min-microvolt = < 600000>;
+				regulator-max-microvolt = <1500000>;
+				regulator-always-on;
+				regulator-boot-on;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			vdd2_reg: vdd2 {
+				regulator-min-microvolt = < 600000>;
+				regulator-max-microvolt = <1500000>;
+				regulator-always-on;
+				regulator-boot-on;
+				ti,regulator-ext-sleep-control = <4>;
+			};
+			vddctrl_reg: vddctrl {
+				regulator-min-microvolt = < 600000>;
+				regulator-max-microvolt = <1400000>;
+				regulator-always-on;
+				regulator-boot-on;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			vio_reg: vio {
+				regulator-min-microvolt = <1500000>;
+				regulator-max-microvolt = <1800000>;
+				regulator-always-on;
+				regulator-boot-on;
+				ti,regulator-ext-sleep-control = <1>;
+			};
+			ldo1_reg: ldo1 {
+				regulator-min-microvolt = <1000000>;
+				regulator-max-microvolt = <3300000>;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			ldo2_reg: ldo2 {
+				regulator-min-microvolt = <1050000>;
+				regulator-max-microvolt = <1050000>;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			ldo3_reg: ldo3 {
+				regulator-min-microvolt = <1000000>;
+				regulator-max-microvolt = <3300000>;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			ldo4_reg: ldo4 {
+				regulator-min-microvolt = <1000000>;
+				regulator-max-microvolt = <3300000>;
+				regulator-always-on;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			ldo5_reg: ldo5 {
+				regulator-min-microvolt = <1000000>;
+				regulator-max-microvolt = <3300000>;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			ldo6_reg: ldo6 {
+				regulator-min-microvolt = <1200000>;
+				regulator-max-microvolt = <1200000>;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			ldo7_reg: ldo7 {
+				regulator-min-microvolt = <1200000>;
+				regulator-max-microvolt = <1200000>;
+				regulator-always-on;
+				regulator-boot-on;
+				ti,regulator-ext-sleep-control = <1>;
+			};
+			ldo8_reg: ldo8 {
+				regulator-min-microvolt = <1000000>;
+				regulator-max-microvolt = <3300000>;
+				regulator-always-on;
+				ti,regulator-ext-sleep-control = <1>;
+			};
+		};
+	};
-- 
cgit v0.10.2


From cd4209ced4d3936cfe51b7b8833260457e2d9995 Mon Sep 17 00:00:00 2001
From: Rhyland Klein <rklein@nvidia.com>
Date: Fri, 11 May 2012 11:36:26 +0200
Subject: mfd: Add tps65910 device-tree support

Add device tree based initialization support for TI's tps65910 pmic.

Signed-off-by: Rhyland Klein <rklein@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c
index 7dffbe1..a00c09e 100644
--- a/drivers/mfd/tps65910.c
+++ b/drivers/mfd/tps65910.c
@@ -23,6 +23,7 @@
 #include <linux/mfd/core.h>
 #include <linux/regmap.h>
 #include <linux/mfd/tps65910.h>
+#include <linux/of_device.h>
 
 static struct mfd_cell tps65910s[] = {
 	{
@@ -129,6 +130,77 @@ err_sleep_init:
 	return ret;
 }
 
+#ifdef CONFIG_OF
+static struct of_device_id tps65910_of_match[] = {
+	{ .compatible = "ti,tps65910", .data = (void *)TPS65910},
+	{ .compatible = "ti,tps65911", .data = (void *)TPS65911},
+	{ },
+};
+MODULE_DEVICE_TABLE(of, tps65910_of_match);
+
+static struct tps65910_board *tps65910_parse_dt(struct i2c_client *client,
+						int *chip_id)
+{
+	struct device_node *np = client->dev.of_node;
+	struct tps65910_board *board_info;
+	unsigned int prop;
+	const struct of_device_id *match;
+	unsigned int prop_array[TPS6591X_MAX_NUM_GPIO];
+	int ret = 0;
+	int idx;
+
+	match = of_match_device(tps65910_of_match, &client->dev);
+	if (!match) {
+		dev_err(&client->dev, "Failed to find matching dt id\n");
+		return NULL;
+	}
+
+	*chip_id  = (int)match->data;
+
+	board_info = devm_kzalloc(&client->dev, sizeof(*board_info),
+			GFP_KERNEL);
+	if (!board_info) {
+		dev_err(&client->dev, "Failed to allocate pdata\n");
+		return NULL;
+	}
+
+	ret = of_property_read_u32(np, "ti,vmbch-threshold", &prop);
+	if (!ret)
+		board_info->vmbch_threshold = prop;
+	else if (*chip_id == TPS65911)
+		dev_warn(&client->dev, "VMBCH-Threshold not specified");
+
+	ret = of_property_read_u32(np, "ti,vmbch2-threshold", &prop);
+	if (!ret)
+		board_info->vmbch2_threshold = prop;
+	else if (*chip_id == TPS65911)
+		dev_warn(&client->dev, "VMBCH2-Threshold not specified");
+
+	ret = of_property_read_u32_array(np, "ti,en-gpio-sleep",
+				   prop_array, TPS6591X_MAX_NUM_GPIO);
+	if (!ret)
+		for (idx = 0; idx < ARRAY_SIZE(prop_array); idx++)
+			board_info->en_gpio_sleep[idx] = (prop_array[idx] != 0);
+	else if (ret != -EINVAL) {
+		dev_err(&client->dev,
+			"error reading property ti,en-gpio-sleep: %d\n.", ret);
+		return NULL;
+	}
+
+
+	board_info->irq = client->irq;
+	board_info->irq_base = -1;
+	board_info->gpio_base = -1;
+
+	return board_info;
+}
+#else
+static inline struct tps65910_board *tps65910_parse_dt(
+					struct i2c_client *client)
+{
+	return NULL;
+}
+#endif
 
 static __devinit int tps65910_i2c_probe(struct i2c_client *i2c,
 					const struct i2c_device_id *id)
@@ -137,8 +209,13 @@ static __devinit int tps65910_i2c_probe(struct i2c_client *i2c,
 	struct tps65910_board *pmic_plat_data;
 	struct tps65910_platform_data *init_data;
 	int ret = 0;
+	int chip_id = id->driver_data;
 
 	pmic_plat_data = dev_get_platdata(&i2c->dev);
+
+	if (!pmic_plat_data && i2c->dev.of_node)
+		pmic_plat_data = tps65910_parse_dt(i2c, &chip_id);
+
 	if (!pmic_plat_data)
 		return -EINVAL;
 
@@ -155,7 +232,7 @@ static __devinit int tps65910_i2c_probe(struct i2c_client *i2c,
 	i2c_set_clientdata(i2c, tps65910);
 	tps65910->dev = &i2c->dev;
 	tps65910->i2c_client = i2c;
-	tps65910->id = id->driver_data;
+	tps65910->id = chip_id;
 	mutex_init(&tps65910->io_mutex);
 
 	tps65910->regmap = regmap_init_i2c(i2c, &tps65910_regmap_config);
@@ -215,6 +292,7 @@ static struct i2c_driver tps65910_i2c_driver = {
 	.driver = {
 		   .name = "tps65910",
 		   .owner = THIS_MODULE,
+		   .of_match_table = of_match_ptr(tps65910_of_match),
 	},
 	.probe = tps65910_i2c_probe,
 	.remove = __devexit_p(tps65910_i2c_remove),
-- 
cgit v0.10.2


From 9577e8c3fbc145b5d2a12d2fbc6a50031573c77d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 9 May 2012 05:43:59 +1000
Subject: mfd: Define all twl-regulator feature flags in one place

twl-regulator has a collection of feature flags, some defined
in twl-core.c and  one defined in i2c/twl.h.
This is confusing for anyone adding a new feature flag.

So collect them together and place them in twl.h immediately
after the structure in which they are initially set.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/twl-core.c b/drivers/mfd/twl-core.c
index 7c2267e..6fc90be 100644
--- a/drivers/mfd/twl-core.c
+++ b/drivers/mfd/twl-core.c
@@ -224,13 +224,6 @@
 #define HIGH_PERF_SQ			(1 << 3)
 #define CK32K_LOWPWR_EN			(1 << 7)
 
-
-/* chip-specific feature flags, for i2c_device_id.driver_data */
-#define TWL4030_VAUX2		BIT(0)	/* pre-5030 voltage ranges */
-#define TPS_SUBSET		BIT(1)	/* tps659[23]0 have fewer LDOs */
-#define TWL5031			BIT(2)  /* twl5031 has different registers */
-#define TWL6030_CLASS		BIT(3)	/* TWL6030 class */
-
 /*----------------------------------------------------------------------*/
 
 /* is driver active, bound to a chip? */
diff --git a/include/linux/i2c/twl.h b/include/linux/i2c/twl.h
index 1f90de0..d1afedc 100644
--- a/include/linux/i2c/twl.h
+++ b/include/linux/i2c/twl.h
@@ -171,8 +171,6 @@ static inline int twl_class_is_ ##class(void)	\
 TWL_CLASS_IS(4030, TWL4030_CLASS_ID)
 TWL_CLASS_IS(6030, TWL6030_CLASS_ID)
 
-#define TWL6025_SUBCLASS	BIT(4)  /* TWL6025 has changed registers */
-
 /*
  * Read and write single 8-bit registers
  */
@@ -746,6 +744,12 @@ struct twl_regulator_driver_data {
 	void		*data;
 	unsigned long	features;
 };
+/* chip-specific feature flags, for twl_regulator_driver_data.features */
+#define TWL4030_VAUX2		BIT(0)	/* pre-5030 voltage ranges */
+#define TPS_SUBSET		BIT(1)	/* tps659[23]0 have fewer LDOs */
+#define TWL5031			BIT(2)  /* twl5031 has different registers */
+#define TWL6030_CLASS		BIT(3)	/* TWL6030 class */
+#define TWL6025_SUBCLASS	BIT(4)  /* TWL6025 has changed registers */
 
 /*----------------------------------------------------------------------*/
 
-- 
cgit v0.10.2


From 3bf6bf9be51a0195c6b1604454fdd28ed1cc1770 Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Wed, 9 May 2012 18:40:54 +0530
Subject: mfd: Cache tps65910 register when we need it

During regmap initialization, we do not provide the default value and
hence in place of caching register during regmap_init(), cache it
when actually we need it i.e. after reading of that register.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Reviewed-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c
index a00c09e..01570a7 100644
--- a/drivers/mfd/tps65910.c
+++ b/drivers/mfd/tps65910.c
@@ -62,8 +62,7 @@ static const struct regmap_config tps65910_regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 8,
 	.volatile_reg = is_volatile_reg,
-	.max_register = TPS65910_MAX_REGISTER,
-	.num_reg_defaults_raw = TPS65910_MAX_REGISTER,
+	.max_register = TPS65910_MAX_REGISTER - 1,
 	.cache_type = REGCACHE_RBTREE,
 };
 
-- 
cgit v0.10.2


From 63fe7dee9183118716078a9f2503f5f805d37c12 Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Fri, 11 May 2012 12:36:57 +0200
Subject: mfd: Convert all tps65910 allocation to devm_*

Convert memory allocation and regmap initialization to
use devm_* functions.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Reviewed-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c
index 01570a7..22fa430 100644
--- a/drivers/mfd/tps65910.c
+++ b/drivers/mfd/tps65910.c
@@ -218,15 +218,13 @@ static __devinit int tps65910_i2c_probe(struct i2c_client *i2c,
 	if (!pmic_plat_data)
 		return -EINVAL;
 
-	init_data = kzalloc(sizeof(struct tps65910_platform_data), GFP_KERNEL);
+	init_data = devm_kzalloc(&i2c->dev, sizeof(*init_data), GFP_KERNEL);
 	if (init_data == NULL)
 		return -ENOMEM;
 
-	tps65910 = kzalloc(sizeof(struct tps65910), GFP_KERNEL);
-	if (tps65910 == NULL) {
-		kfree(init_data);
+	tps65910 = devm_kzalloc(&i2c->dev, sizeof(*tps65910), GFP_KERNEL);
+	if (tps65910 == NULL)
 		return -ENOMEM;
-	}
 
 	i2c_set_clientdata(i2c, tps65910);
 	tps65910->dev = &i2c->dev;
@@ -234,18 +232,20 @@ static __devinit int tps65910_i2c_probe(struct i2c_client *i2c,
 	tps65910->id = chip_id;
 	mutex_init(&tps65910->io_mutex);
 
-	tps65910->regmap = regmap_init_i2c(i2c, &tps65910_regmap_config);
+	tps65910->regmap = devm_regmap_init_i2c(i2c, &tps65910_regmap_config);
 	if (IS_ERR(tps65910->regmap)) {
 		ret = PTR_ERR(tps65910->regmap);
 		dev_err(&i2c->dev, "regmap initialization failed: %d\n", ret);
-		goto regmap_err;
+		return ret;
 	}
 
 	ret = mfd_add_devices(tps65910->dev, -1,
 			      tps65910s, ARRAY_SIZE(tps65910s),
 			      NULL, 0);
-	if (ret < 0)
-		goto err;
+	if (ret < 0) {
+		dev_err(&i2c->dev, "mfd_add_devices failed: %d\n", ret);
+		return ret;
+	}
 
 	init_data->irq = pmic_plat_data->irq;
 	init_data->irq_base = pmic_plat_data->irq_base;
@@ -256,14 +256,6 @@ static __devinit int tps65910_i2c_probe(struct i2c_client *i2c,
 
 	tps65910_sleepinit(tps65910, pmic_plat_data);
 
-	kfree(init_data);
-	return ret;
-
-err:
-	regmap_exit(tps65910->regmap);
-regmap_err:
-	kfree(tps65910);
-	kfree(init_data);
 	return ret;
 }
 
@@ -273,8 +265,6 @@ static __devexit int tps65910_i2c_remove(struct i2c_client *i2c)
 
 	tps65910_irq_exit(tps65910);
 	mfd_remove_devices(tps65910->dev);
-	regmap_exit(tps65910->regmap);
-	kfree(tps65910);
 
 	return 0;
 }
-- 
cgit v0.10.2


From 32df986e985921386b75b4bd1117102bf65fe095 Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Fri, 11 May 2012 15:07:44 +0200
Subject: mfd: Register tps65910 gpios as an mfd device

As gpio support for tps65910 is on gpio driver, registering
gpio support as the mfd sub devices instead of calling gpio_init()
from the core probe.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 1e9a7d5..b914483 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -189,7 +189,6 @@ config MFD_TPS65910
 	bool "TPS65910 Power Management chip"
 	depends on I2C=y && GPIOLIB
 	select MFD_CORE
-	select GPIO_TPS65910
 	select REGMAP_I2C
 	help
 	  if you say yes here you get support for the TPS65910 series of
diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c
index 22fa430..553574d 100644
--- a/drivers/mfd/tps65910.c
+++ b/drivers/mfd/tps65910.c
@@ -19,7 +19,6 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/i2c.h>
-#include <linux/gpio.h>
 #include <linux/mfd/core.h>
 #include <linux/regmap.h>
 #include <linux/mfd/tps65910.h>
@@ -27,6 +26,9 @@
 
 static struct mfd_cell tps65910s[] = {
 	{
+		.name = "tps65910-gpio",
+	},
+	{
 		.name = "tps65910-pmic",
 	},
 	{
@@ -250,8 +252,6 @@ static __devinit int tps65910_i2c_probe(struct i2c_client *i2c,
 	init_data->irq = pmic_plat_data->irq;
 	init_data->irq_base = pmic_plat_data->irq_base;
 
-	tps65910_gpio_init(tps65910, pmic_plat_data->gpio_base);
-
 	tps65910_irq_init(tps65910, init_data->irq, init_data);
 
 	tps65910_sleepinit(tps65910, pmic_plat_data);
diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h
index 949f1da..c2673ee 100644
--- a/include/linux/mfd/tps65910.h
+++ b/include/linux/mfd/tps65910.h
@@ -830,9 +830,6 @@ struct tps65910 {
 	struct tps65910_rtc *rtc;
 	struct tps65910_power *power;
 
-	/* GPIO Handling */
-	struct gpio_chip gpio;
-
 	/* IRQ Handling */
 	struct mutex irq_lock;
 	int chip_irq;
@@ -846,7 +843,6 @@ struct tps65910_platform_data {
 	int irq_base;
 };
 
-void tps65910_gpio_init(struct tps65910 *tps65910, int gpio_base);
 int tps65910_irq_init(struct tps65910 *tps65910, int irq,
 		struct tps65910_platform_data *pdata);
 int tps65910_irq_exit(struct tps65910 *tps65910);
-- 
cgit v0.10.2


From 7f65f74ccee15f6eb0009921a428e3c5d5d06ae0 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Fri, 11 May 2012 15:10:28 +0200
Subject: mfd: Fix tps65910 build failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The tps65910_parse_dt() prototype for !CONFIG_OF was not correct, leading to:

drivers/mfd/tps65910.c: In function ‘tps65910_i2c_probe’:
drivers/mfd/tps65910.c:218:3: error: too many arguments to function ‘tps65910_parse_dt’

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c
index 553574d..18b30cf 100644
--- a/drivers/mfd/tps65910.c
+++ b/drivers/mfd/tps65910.c
@@ -196,8 +196,9 @@ static struct tps65910_board *tps65910_parse_dt(struct i2c_client *client,
 	return board_info;
 }
 #else
-static inline struct tps65910_board *tps65910_parse_dt(
-					struct i2c_client *client)
+static inline
+struct tps65910_board *tps65910_parse_dt(struct i2c_client *client,
+					 int *chip_id)
 {
 	return NULL;
 }
-- 
cgit v0.10.2


From 27757e8262669321b496c55f06f4844e827fd1c5 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Wed, 9 May 2012 21:18:05 +0100
Subject: mfd: Staticise non-exported tps65217_update_bits()

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: AnilKumar Ch <anilkumar@ti.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65217.c b/drivers/mfd/tps65217.c
index c064c0a..db194e4 100644
--- a/drivers/mfd/tps65217.c
+++ b/drivers/mfd/tps65217.c
@@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(tps65217_reg_write);
  * @val: Value to write.
  * @level: Password protected level
  */
-int tps65217_update_bits(struct tps65217 *tps, unsigned int reg,
+static int tps65217_update_bits(struct tps65217 *tps, unsigned int reg,
 		unsigned int mask, unsigned int val, unsigned int level)
 {
 	int ret;
-- 
cgit v0.10.2


From 879eed68265c8dcb2f2856ec96820fc93b7038c9 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Wed, 9 May 2012 22:53:48 +0100
Subject: mfd: Remove wm8400 custom cache implementation

Save a useful amount of code by removing the custom cache implementation
for wm8400 and using the regmap cache. Also simplify things by not
separately reseting the CODEC registers, this is a sufficiently infrequent
operation that we can simply invalidate the entire cache when this happens.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/wm8400-core.c b/drivers/mfd/wm8400-core.c
index 1189a17..9083b77 100644
--- a/drivers/mfd/wm8400-core.c
+++ b/drivers/mfd/wm8400-core.c
@@ -23,136 +23,16 @@
 #include <linux/regmap.h>
 #include <linux/slab.h>
 
-static struct {
-	u16  readable;    /* Mask of readable bits */
-	u16  writable;    /* Mask of writable bits */
-	u16  vol;         /* Mask of volatile bits */
-	int  is_codec;    /* Register controlled by codec reset */
-	u16  default_val; /* Value on reset */
-} reg_data[] = {
-	{ 0xFFFF, 0xFFFF, 0x0000, 0, 0x6172 }, /* R0 */
-	{ 0x7000, 0x0000, 0x8000, 0, 0x0000 }, /* R1 */
-	{ 0xFF17, 0xFF17, 0x0000, 0, 0x0000 }, /* R2 */
-	{ 0xEBF3, 0xEBF3, 0x0000, 1, 0x6000 }, /* R3 */
-	{ 0x3CF3, 0x3CF3, 0x0000, 1, 0x0000 }, /* R4  */
-	{ 0xF1F8, 0xF1F8, 0x0000, 1, 0x4050 }, /* R5  */
-	{ 0xFC1F, 0xFC1F, 0x0000, 1, 0x4000 }, /* R6  */
-	{ 0xDFDE, 0xDFDE, 0x0000, 1, 0x01C8 }, /* R7  */
-	{ 0xFCFC, 0xFCFC, 0x0000, 1, 0x0000 }, /* R8  */
-	{ 0xEFFF, 0xEFFF, 0x0000, 1, 0x0040 }, /* R9  */
-	{ 0xEFFF, 0xEFFF, 0x0000, 1, 0x0040 }, /* R10 */
-	{ 0x27F7, 0x27F7, 0x0000, 1, 0x0004 }, /* R11 */
-	{ 0x01FF, 0x01FF, 0x0000, 1, 0x00C0 }, /* R12 */
-	{ 0x01FF, 0x01FF, 0x0000, 1, 0x00C0 }, /* R13 */
-	{ 0x1FEF, 0x1FEF, 0x0000, 1, 0x0000 }, /* R14 */
-	{ 0x0163, 0x0163, 0x0000, 1, 0x0100 }, /* R15 */
-	{ 0x01FF, 0x01FF, 0x0000, 1, 0x00C0 }, /* R16 */
-	{ 0x01FF, 0x01FF, 0x0000, 1, 0x00C0 }, /* R17 */
-	{ 0x1FFF, 0x0FFF, 0x0000, 1, 0x0000 }, /* R18 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 1, 0x1000 }, /* R19 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 1, 0x1010 }, /* R20 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 1, 0x1010 }, /* R21 */
-	{ 0x0FDD, 0x0FDD, 0x0000, 1, 0x8000 }, /* R22 */
-	{ 0x1FFF, 0x1FFF, 0x0000, 1, 0x0800 }, /* R23 */
-	{ 0x0000, 0x01DF, 0x0000, 1, 0x008B }, /* R24 */
-	{ 0x0000, 0x01DF, 0x0000, 1, 0x008B }, /* R25 */
-	{ 0x0000, 0x01DF, 0x0000, 1, 0x008B }, /* R26 */
-	{ 0x0000, 0x01DF, 0x0000, 1, 0x008B }, /* R27 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R28 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R29 */
-	{ 0x0000, 0x0077, 0x0000, 1, 0x0066 }, /* R30 */
-	{ 0x0000, 0x0033, 0x0000, 1, 0x0022 }, /* R31 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0079 }, /* R32 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0079 }, /* R33 */
-	{ 0x0000, 0x0003, 0x0000, 1, 0x0003 }, /* R34 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0003 }, /* R35 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R36 */
-	{ 0x0000, 0x003F, 0x0000, 1, 0x0100 }, /* R37 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R38 */
-	{ 0x0000, 0x000F, 0x0000, 0, 0x0000 }, /* R39 */
-	{ 0x0000, 0x00FF, 0x0000, 1, 0x0000 }, /* R40 */
-	{ 0x0000, 0x01B7, 0x0000, 1, 0x0000 }, /* R41 */
-	{ 0x0000, 0x01B7, 0x0000, 1, 0x0000 }, /* R42 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R43 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R44 */
-	{ 0x0000, 0x00FD, 0x0000, 1, 0x0000 }, /* R45 */
-	{ 0x0000, 0x00FD, 0x0000, 1, 0x0000 }, /* R46 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R47 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R48 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R49 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R50 */
-	{ 0x0000, 0x01B3, 0x0000, 1, 0x0180 }, /* R51 */
-	{ 0x0000, 0x0077, 0x0000, 1, 0x0000 }, /* R52 */
-	{ 0x0000, 0x0077, 0x0000, 1, 0x0000 }, /* R53 */
-	{ 0x0000, 0x00FF, 0x0000, 1, 0x0000 }, /* R54 */
-	{ 0x0000, 0x0001, 0x0000, 1, 0x0000 }, /* R55 */
-	{ 0x0000, 0x003F, 0x0000, 1, 0x0000 }, /* R56 */
-	{ 0x0000, 0x004F, 0x0000, 1, 0x0000 }, /* R57 */
-	{ 0x0000, 0x00FD, 0x0000, 1, 0x0000 }, /* R58 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R59 */
-	{ 0x1FFF, 0x1FFF, 0x0000, 1, 0x0000 }, /* R60 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 1, 0x0000 }, /* R61 */
-	{ 0x03FF, 0x03FF, 0x0000, 1, 0x0000 }, /* R62 */
-	{ 0x007F, 0x007F, 0x0000, 1, 0x0000 }, /* R63 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R64 */
-	{ 0xDFFF, 0xDFFF, 0x0000, 0, 0x0000 }, /* R65 */
-	{ 0xDFFF, 0xDFFF, 0x0000, 0, 0x0000 }, /* R66 */
-	{ 0xDFFF, 0xDFFF, 0x0000, 0, 0x0000 }, /* R67 */
-	{ 0xDFFF, 0xDFFF, 0x0000, 0, 0x0000 }, /* R68 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R69 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 0, 0x4400 }, /* R70 */
-	{ 0x23FF, 0x23FF, 0x0000, 0, 0x0000 }, /* R71 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 0, 0x4400 }, /* R72 */
-	{ 0x23FF, 0x23FF, 0x0000, 0, 0x0000 }, /* R73 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R74 */
-	{ 0x000E, 0x000E, 0x0000, 0, 0x0008 }, /* R75 */
-	{ 0xE00F, 0xE00F, 0x0000, 0, 0x0000 }, /* R76 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R77 */
-	{ 0x03C0, 0x03C0, 0x0000, 0, 0x02C0 }, /* R78 */
-	{ 0xFFFF, 0x0000, 0xffff, 0, 0x0000 }, /* R79 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 0, 0x0000 }, /* R80 */
-	{ 0xFFFF, 0x0000, 0xffff, 0, 0x0000 }, /* R81 */
-	{ 0x2BFF, 0x0000, 0xffff, 0, 0x0000 }, /* R82 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R83 */
-	{ 0x80FF, 0x80FF, 0x0000, 0, 0x00ff }, /* R84 */
-};
-
-static int wm8400_read(struct wm8400 *wm8400, u8 reg, int num_regs, u16 *dest)
+static bool wm8400_volatile(struct device *dev, unsigned int reg)
 {
-	int i, ret = 0;
-
-	BUG_ON(reg + num_regs > ARRAY_SIZE(wm8400->reg_cache));
-
-	/* If there are any volatile reads then read back the entire block */
-	for (i = reg; i < reg + num_regs; i++)
-		if (reg_data[i].vol) {
-			ret = regmap_bulk_read(wm8400->regmap, reg, dest,
-					       num_regs);
-			return ret;
-		}
-
-	/* Otherwise use the cache */
-	memcpy(dest, &wm8400->reg_cache[reg], num_regs * sizeof(u16));
-
-	return 0;
-}
-
-static int wm8400_write(struct wm8400 *wm8400, u8 reg, int num_regs,
-			u16 *src)
-{
-	int ret, i;
-
-	BUG_ON(reg + num_regs > ARRAY_SIZE(wm8400->reg_cache));
-
-	for (i = 0; i < num_regs; i++) {
-		BUG_ON(!reg_data[reg + i].writable);
-		wm8400->reg_cache[reg + i] = src[i];
-		ret = regmap_write(wm8400->regmap, reg, src[i]);
-		if (ret != 0)
-			return ret;
+	switch (reg) {
+	case WM8400_INTERRUPT_STATUS_1:
+	case WM8400_INTERRUPT_LEVELS:
+	case WM8400_SHUTDOWN_REASON:
+		return true;
+	default:
+		return false;
 	}
-
-	return 0;
 }
 
 /**
@@ -165,13 +45,12 @@ static int wm8400_write(struct wm8400 *wm8400, u8 reg, int num_regs,
  */
 u16 wm8400_reg_read(struct wm8400 *wm8400, u8 reg)
 {
-	u16 val;
-
-	mutex_lock(&wm8400->io_lock);
-
-	wm8400_read(wm8400, reg, 1, &val);
+	unsigned int val;
+	int ret;
 
-	mutex_unlock(&wm8400->io_lock);
+	ret = regmap_read(wm8400->regmap, reg, &val);
+	if (ret < 0)
+		return ret;
 
 	return val;
 }
@@ -179,62 +58,8 @@ EXPORT_SYMBOL_GPL(wm8400_reg_read);
 
 int wm8400_block_read(struct wm8400 *wm8400, u8 reg, int count, u16 *data)
 {
-	int ret;
-
-	mutex_lock(&wm8400->io_lock);
-
-	ret = wm8400_read(wm8400, reg, count, data);
-
-	mutex_unlock(&wm8400->io_lock);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(wm8400_block_read);
-
-/**
- * wm8400_set_bits - Bitmask write
- *
- * @wm8400: Pointer to wm8400 control structure
- * @reg:    Register to access
- * @mask:   Mask of bits to change
- * @val:    Value to set for masked bits
- */
-int wm8400_set_bits(struct wm8400 *wm8400, u8 reg, u16 mask, u16 val)
-{
-	u16 tmp;
-	int ret;
-
-	mutex_lock(&wm8400->io_lock);
-
-	ret = wm8400_read(wm8400, reg, 1, &tmp);
-	tmp = (tmp & ~mask) | val;
-	if (ret == 0)
-		ret = wm8400_write(wm8400, reg, 1, &tmp);
-
-	mutex_unlock(&wm8400->io_lock);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(wm8400_set_bits);
-
-/**
- * wm8400_reset_codec_reg_cache - Reset cached codec registers to
- * their default values.
- */
-void wm8400_reset_codec_reg_cache(struct wm8400 *wm8400)
-{
-	int i;
-
-	mutex_lock(&wm8400->io_lock);
-
-	/* Reset all codec registers to their initial value */
-	for (i = 0; i < ARRAY_SIZE(wm8400->reg_cache); i++)
-		if (reg_data[i].is_codec)
-			wm8400->reg_cache[i] = reg_data[i].default_val;
-
-	mutex_unlock(&wm8400->io_lock);
+	return regmap_bulk_read(wm8400->regmap, reg, data, count);
 }
-EXPORT_SYMBOL_GPL(wm8400_reset_codec_reg_cache);
 
 static int wm8400_register_codec(struct wm8400 *wm8400)
 {
@@ -257,44 +82,24 @@ static int wm8400_register_codec(struct wm8400 *wm8400)
 static int wm8400_init(struct wm8400 *wm8400,
 		       struct wm8400_platform_data *pdata)
 {
-	u16 reg;
-	int ret, i;
-
-	mutex_init(&wm8400->io_lock);
+	unsigned int reg;
+	int ret;
 
 	dev_set_drvdata(wm8400->dev, wm8400);
 
 	/* Check that this is actually a WM8400 */
-	ret = regmap_read(wm8400->regmap, WM8400_RESET_ID, &i);
+	ret = regmap_read(wm8400->regmap, WM8400_RESET_ID, &reg);
 	if (ret != 0) {
 		dev_err(wm8400->dev, "Chip ID register read failed\n");
 		return -EIO;
 	}
-	if (i != reg_data[WM8400_RESET_ID].default_val) {
-		dev_err(wm8400->dev, "Device is not a WM8400, ID is %x\n", i);
+	if (reg != 0x6172) {
+		dev_err(wm8400->dev, "Device is not a WM8400, ID is %x\n",
+			reg);
 		return -ENODEV;
 	}
 
-	/* We don't know what state the hardware is in and since this
-	 * is a PMIC we can't reset it safely so initialise the register
-	 * cache from the hardware.
-	 */
-	ret = regmap_raw_read(wm8400->regmap, 0, wm8400->reg_cache,
-			      ARRAY_SIZE(wm8400->reg_cache));
-	if (ret != 0) {
-		dev_err(wm8400->dev, "Register cache read failed\n");
-		return -EIO;
-	}
-	for (i = 0; i < ARRAY_SIZE(wm8400->reg_cache); i++)
-		wm8400->reg_cache[i] = be16_to_cpu(wm8400->reg_cache[i]);
-
-	/* If the codec is in reset use hard coded values */
-	if (!(wm8400->reg_cache[WM8400_POWER_MANAGEMENT_1] & WM8400_CODEC_ENA))
-		for (i = 0; i < ARRAY_SIZE(wm8400->reg_cache); i++)
-			if (reg_data[i].is_codec)
-				wm8400->reg_cache[i] = reg_data[i].default_val;
-
-	ret = wm8400_read(wm8400, WM8400_ID, 1, &reg);
+	ret = regmap_read(wm8400->regmap, WM8400_ID, &reg);
 	if (ret != 0) {
 		dev_err(wm8400->dev, "ID register read failed: %d\n", ret);
 		return ret;
@@ -334,8 +139,22 @@ static const struct regmap_config wm8400_regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 16,
 	.max_register = WM8400_REGISTER_COUNT - 1,
+
+	.volatile_reg = wm8400_volatile,
+
+	.cache_type = REGCACHE_RBTREE,
 };
 
+/**
+ * wm8400_reset_codec_reg_cache - Reset cached codec registers to
+ * their default values.
+ */
+void wm8400_reset_codec_reg_cache(struct wm8400 *wm8400)
+{
+	regmap_reinit_cache(wm8400->regmap, &wm8400_regmap_config);
+}
+EXPORT_SYMBOL_GPL(wm8400_reset_codec_reg_cache);
+
 #if defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE)
 static int wm8400_i2c_probe(struct i2c_client *i2c,
 			    const struct i2c_device_id *id)
diff --git a/include/linux/mfd/wm8400-private.h b/include/linux/mfd/wm8400-private.h
index 0147b69..2de565b 100644
--- a/include/linux/mfd/wm8400-private.h
+++ b/include/linux/mfd/wm8400-private.h
@@ -24,19 +24,14 @@
 #include <linux/mfd/wm8400.h>
 #include <linux/mutex.h>
 #include <linux/platform_device.h>
-
-struct regmap;
+#include <linux/regmap.h>
 
 #define WM8400_REGISTER_COUNT 0x55
 
 struct wm8400 {
 	struct device *dev;
-
-	struct mutex io_lock;
 	struct regmap *regmap;
 
-	u16 reg_cache[WM8400_REGISTER_COUNT];
-
 	struct platform_device regulators[6];
 };
 
@@ -930,6 +925,11 @@ struct wm8400 {
 
 u16 wm8400_reg_read(struct wm8400 *wm8400, u8 reg);
 int wm8400_block_read(struct wm8400 *wm8400, u8 reg, int count, u16 *data);
-int wm8400_set_bits(struct wm8400 *wm8400, u8 reg, u16 mask, u16 val);
+
+static inline int wm8400_set_bits(struct wm8400 *wm8400, u8 reg,
+				  u16 mask, u16 val)
+{
+	return regmap_update_bits(wm8400->regmap, reg, mask, val);
+}
 
 #endif
-- 
cgit v0.10.2


From d9055dc501da6734e3cfea1ef236173bd8b645b1 Mon Sep 17 00:00:00 2001
From: Johan Hovold <jhovold@gmail.com>
Date: Thu, 10 May 2012 14:11:28 +0200
Subject: mfd: Add boost frequency and ovp to lm3533 platform data

Add boost-frequency and over-voltage-protection settings to platform
data.

Signed-off-by: Johan Hovold <jhovold@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/lm3533-core.c b/drivers/mfd/lm3533-core.c
index 75f4b7f..053438c 100644
--- a/drivers/mfd/lm3533-core.c
+++ b/drivers/mfd/lm3533-core.c
@@ -138,6 +138,35 @@ int lm3533_update(struct lm3533 *lm3533, u8 reg, u8 val, u8 mask)
 }
 EXPORT_SYMBOL_GPL(lm3533_update);
 
+static int lm3533_set_boost_freq(struct lm3533 *lm3533,
+						enum lm3533_boost_freq freq)
+{
+	int ret;
+
+	ret = lm3533_update(lm3533, LM3533_REG_BOOST_PWM,
+					freq << LM3533_BOOST_FREQ_SHIFT,
+					LM3533_BOOST_FREQ_MASK);
+	if (ret)
+		dev_err(lm3533->dev, "failed to set boost frequency\n");
+
+	return ret;
+}
+
+
+static int lm3533_set_boost_ovp(struct lm3533 *lm3533,
+						enum lm3533_boost_ovp ovp)
+{
+	int ret;
+
+	ret = lm3533_update(lm3533, LM3533_REG_BOOST_PWM,
+					ovp << LM3533_BOOST_OVP_SHIFT,
+					LM3533_BOOST_OVP_MASK);
+	if (ret)
+		dev_err(lm3533->dev, "failed to set boost ovp\n");
+
+	return ret;
+}
+
 /*
  * HVLED output config -- output hvled controlled by backlight bl
  */
@@ -521,6 +550,22 @@ static int __devinit lm3533_device_led_init(struct lm3533 *lm3533)
 	return 0;
 }
 
+static int __devinit lm3533_device_setup(struct lm3533 *lm3533,
+					struct lm3533_platform_data *pdata)
+{
+	int ret;
+
+	ret = lm3533_set_boost_freq(lm3533, pdata->boost_freq);
+	if (ret)
+		return ret;
+
+	ret = lm3533_set_boost_ovp(lm3533, pdata->boost_ovp);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
 static int __devinit lm3533_device_init(struct lm3533 *lm3533)
 {
 	struct lm3533_platform_data *pdata = lm3533->dev->platform_data;
@@ -550,6 +595,10 @@ static int __devinit lm3533_device_init(struct lm3533 *lm3533)
 
 	lm3533_enable(lm3533);
 
+	ret = lm3533_device_setup(lm3533, pdata);
+	if (ret)
+		goto err_disable;
+
 	lm3533_device_als_init(lm3533);
 	lm3533_device_bl_init(lm3533);
 	lm3533_device_led_init(lm3533);
@@ -564,6 +613,7 @@ static int __devinit lm3533_device_init(struct lm3533 *lm3533)
 
 err_unregister:
 	mfd_remove_devices(lm3533->dev);
+err_disable:
 	lm3533_disable(lm3533);
 	if (gpio_is_valid(lm3533->gpio_hwen))
 		gpio_free(lm3533->gpio_hwen);
diff --git a/include/linux/mfd/lm3533.h b/include/linux/mfd/lm3533.h
index 75f85f3..3361137 100644
--- a/include/linux/mfd/lm3533.h
+++ b/include/linux/mfd/lm3533.h
@@ -59,9 +59,24 @@ struct lm3533_led_platform_data {
 	u8 pwm;				/* 0 - 0x3f */
 };
 
+enum lm3533_boost_freq {
+	LM3533_BOOST_FREQ_500KHZ,
+	LM3533_BOOST_FREQ_1000KHZ,
+};
+
+enum lm3533_boost_ovp {
+	LM3533_BOOST_OVP_16V,
+	LM3533_BOOST_OVP_24V,
+	LM3533_BOOST_OVP_32V,
+	LM3533_BOOST_OVP_40V,
+};
+
 struct lm3533_platform_data {
 	int gpio_hwen;
 
+	enum lm3533_boost_ovp boost_ovp;
+	enum lm3533_boost_freq boost_freq;
+
 	struct lm3533_als_platform_data *als;
 
 	struct lm3533_bl_platform_data *backlights;
-- 
cgit v0.10.2


From f4cf18ca5914bc1edb03c9049198738255fa4a23 Mon Sep 17 00:00:00 2001
From: Johan Hovold <jhovold@gmail.com>
Date: Thu, 10 May 2012 14:11:29 +0200
Subject: mfd: Remove lm3533 boost attributes

Remove boost-frequency and ovp attributes, which can be set through
platform data, from sysfs.

Signed-off-by: Johan Hovold <jhovold@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533 b/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533
index 5700721..1b62230 100644
--- a/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533
+++ b/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533
@@ -1,26 +1,3 @@
-What:		/sys/bus/i2c/devices/.../boost_freq
-Date:		April 2012
-KernelVersion:	3.5
-Contact:	Johan Hovold <jhovold@gmail.com>
-Description:
-		Set the boost converter switching frequency (0, 1), where
-
-		0 -  500Hz
-		1 - 1000Hz
-
-What:		/sys/bus/i2c/devices/.../boost_ovp
-Date:		April 2012
-KernelVersion:	3.5
-Contact:	Johan Hovold <jhovold@gmail.com>
-Description:
-		Set the boost converter over-voltage protection threshold
-		(0..3), where
-
-		0 - 16V
-		1 - 24V
-		2 - 32V
-		3 - 40V
-
 What:		/sys/bus/i2c/devices/.../output_hvled[n]
 Date:		April 2012
 KernelVersion:	3.5
diff --git a/drivers/mfd/lm3533-core.c b/drivers/mfd/lm3533-core.c
index 053438c..d670543 100644
--- a/drivers/mfd/lm3533-core.c
+++ b/drivers/mfd/lm3533-core.c
@@ -26,11 +26,9 @@
 #include <linux/mfd/lm3533.h>
 
 
-#define LM3533_BOOST_OVP_MAX		0x03
 #define LM3533_BOOST_OVP_MASK		0x06
 #define LM3533_BOOST_OVP_SHIFT		1
 
-#define LM3533_BOOST_FREQ_MAX		0x01
 #define LM3533_BOOST_FREQ_MASK		0x01
 #define LM3533_BOOST_FREQ_SHIFT		0
 
@@ -253,96 +251,12 @@ struct lm3533_device_attribute {
 		struct {
 			u8 id;
 		} output;
-		struct {
-			u8 reg;
-			u8 shift;
-			u8 mask;
-			u8 max;
-		} generic;
 	} u;
 };
 
 #define to_lm3533_dev_attr(_attr) \
 	container_of(_attr, struct lm3533_device_attribute, dev_attr)
 
-static ssize_t show_lm3533_reg(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	struct lm3533 *lm3533 = dev_get_drvdata(dev);
-	struct lm3533_device_attribute *lattr = to_lm3533_dev_attr(attr);
-	u8 val;
-	int ret;
-
-	ret = lm3533_read(lm3533, lattr->u.generic.reg, &val);
-	if (ret)
-		return ret;
-
-	val = (val & lattr->u.generic.mask) >> lattr->u.generic.shift;
-
-	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
-}
-
-static ssize_t store_lm3533_reg(struct device *dev,
-						struct device_attribute *attr,
-						const char *buf, size_t len)
-{
-	struct lm3533 *lm3533 = dev_get_drvdata(dev);
-	struct lm3533_device_attribute *lattr = to_lm3533_dev_attr(attr);
-	u8 val;
-	int ret;
-
-	if (kstrtou8(buf, 0, &val) || val > lattr->u.generic.max)
-		return -EINVAL;
-
-	val = val << lattr->u.generic.shift;
-	ret = lm3533_update(lm3533, lattr->u.generic.reg, val,
-							lattr->u.generic.mask);
-	if (ret)
-		return ret;
-
-	return len;
-}
-
-#define GENERIC_ATTR(_reg, _max, _mask, _shift) \
-	{ .reg		= _reg, \
-	  .max		= _max, \
-	  .mask		= _mask, \
-	  .shift	= _shift }
-
-#define LM3533_GENERIC_ATTR(_name, _mode, _show, _store, _type,	\
-						_reg, _max, _mask, _shift) \
-	struct lm3533_device_attribute lm3533_dev_attr_##_name = { \
-		.dev_attr	= __ATTR(_name, _mode, _show, _store), \
-		.type		= _type, \
-		.u.generic	= GENERIC_ATTR(_reg, _max, _mask, _shift) }
-
-#define LM3533_GENERIC_ATTR_RW(_name, _type, _reg, _max, _mask, _shift) \
-	LM3533_GENERIC_ATTR(_name, S_IRUGO | S_IWUSR, \
-					show_lm3533_reg, store_lm3533_reg, \
-					_type, _reg, _max, _mask, _shift)
-
-#define LM3533_BOOST_ATTR_RW(_name, _NAME) \
-	LM3533_GENERIC_ATTR_RW(_name, LM3533_ATTR_TYPE_BACKLIGHT, \
-				LM3533_REG_BOOST_PWM, LM3533_##_NAME##_MAX, \
-				LM3533_##_NAME##_MASK, LM3533_##_NAME##_SHIFT)
-/*
- * Boost Over Voltage Protection Select
- *
- *   0 - 16 V (default)
- *   1 - 24 V
- *   2 - 32 V
- *   3 - 40 V
- */
-static LM3533_BOOST_ATTR_RW(boost_ovp, BOOST_OVP);
-
-/*
- * Boost Frequency Select
- *
- *   0 - 500 kHz (default)
- *   1 - 1 MHz
- */
-static LM3533_BOOST_ATTR_RW(boost_freq, BOOST_FREQ);
-
 static ssize_t show_output(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
@@ -432,8 +346,6 @@ static LM3533_OUTPUT_LVLED_ATTR_RW(4);
 static LM3533_OUTPUT_LVLED_ATTR_RW(5);
 
 static struct attribute *lm3533_attributes[] = {
-	&lm3533_dev_attr_boost_freq.dev_attr.attr,
-	&lm3533_dev_attr_boost_ovp.dev_attr.attr,
 	&lm3533_dev_attr_output_hvled1.dev_attr.attr,
 	&lm3533_dev_attr_output_hvled2.dev_attr.attr,
 	&lm3533_dev_attr_output_lvled1.dev_attr.attr,
-- 
cgit v0.10.2


From 7af5e87dc5e6b6f413ba95b06e06ebf810687858 Mon Sep 17 00:00:00 2001
From: Johan Hovold <jhovold@gmail.com>
Date: Thu, 10 May 2012 19:18:28 +0200
Subject: mfd: Remove unused max-current lm3533 function

The max-current attributes of the subdrivers have been dropped so
remove the no longer used lm3533_ctrlbank_get_max_current function.

Signed-off-by: Johan Hovold <jhovold@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/lm3533-ctrlbank.c b/drivers/mfd/lm3533-ctrlbank.c
index c2732a3..adf4c1a 100644
--- a/drivers/mfd/lm3533-ctrlbank.c
+++ b/drivers/mfd/lm3533-ctrlbank.c
@@ -113,7 +113,6 @@ lm3533_ctrlbank_get(brightness, BRIGHTNESS);
  *   31 - 29.8 mA
  */
 lm3533_ctrlbank_set(max_current, MAX_CURRENT);
-lm3533_ctrlbank_get(max_current, MAX_CURRENT);
 
 /*
  * PWM-input control mask:
diff --git a/include/linux/mfd/lm3533.h b/include/linux/mfd/lm3533.h
index 3361137..7cfef9e 100644
--- a/include/linux/mfd/lm3533.h
+++ b/include/linux/mfd/lm3533.h
@@ -92,8 +92,6 @@ extern int lm3533_ctrlbank_disable(struct lm3533_ctrlbank *cb);
 extern int lm3533_ctrlbank_set_brightness(struct lm3533_ctrlbank *cb, u8 val);
 extern int lm3533_ctrlbank_get_brightness(struct lm3533_ctrlbank *cb, u8 *val);
 extern int lm3533_ctrlbank_set_max_current(struct lm3533_ctrlbank *cb, u8 val);
-extern int lm3533_ctrlbank_get_max_current(struct lm3533_ctrlbank *cb,
-								u8 *val);
 extern int lm3533_ctrlbank_set_pwm(struct lm3533_ctrlbank *cb, u8 val);
 extern int lm3533_ctrlbank_get_pwm(struct lm3533_ctrlbank *cb, u8 *val);
 
-- 
cgit v0.10.2


From 6fa4b9d802610116adf4b89c2f9bd155829aafd3 Mon Sep 17 00:00:00 2001
From: Johan Hovold <jhovold@gmail.com>
Date: Thu, 10 May 2012 19:18:29 +0200
Subject: mfd: Use SI-units for the lm3533 max-current interface

Use SI-units (uA) for max-current interface (5000 - 29800 uA).

Signed-off-by: Johan Hovold <jhovold@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/lm3533-ctrlbank.c b/drivers/mfd/lm3533-ctrlbank.c
index adf4c1a..a4cb7a5 100644
--- a/drivers/mfd/lm3533-ctrlbank.c
+++ b/drivers/mfd/lm3533-ctrlbank.c
@@ -17,8 +17,11 @@
 #include <linux/mfd/lm3533.h>
 
 
+#define LM3533_MAX_CURRENT_MIN		5000
+#define LM3533_MAX_CURRENT_MAX		29800
+#define LM3533_MAX_CURRENT_STEP		800
+
 #define LM3533_BRIGHTNESS_MAX		255
-#define LM3533_MAX_CURRENT_MAX		31
 #define LM3533_PWM_MAX			0x3f
 
 #define LM3533_REG_PWM_BASE		0x14
@@ -65,6 +68,31 @@ int lm3533_ctrlbank_disable(struct lm3533_ctrlbank *cb)
 }
 EXPORT_SYMBOL_GPL(lm3533_ctrlbank_disable);
 
+/*
+ * Full-scale current.
+ *
+ * imax		5000 - 29800 uA (800 uA step)
+ */
+int lm3533_ctrlbank_set_max_current(struct lm3533_ctrlbank *cb, u16 imax)
+{
+	u8 reg;
+	u8 val;
+	int ret;
+
+	if (imax < LM3533_MAX_CURRENT_MIN || imax > LM3533_MAX_CURRENT_MAX)
+		return -EINVAL;
+
+	val = (imax - LM3533_MAX_CURRENT_MIN) / LM3533_MAX_CURRENT_STEP;
+
+	reg = lm3533_ctrlbank_get_reg(cb, LM3533_REG_MAX_CURRENT_BASE);
+	ret = lm3533_write(cb->lm3533, reg, val);
+	if (ret)
+		dev_err(cb->dev, "failed to set max current\n");
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lm3533_ctrlbank_set_max_current);
+
 #define lm3533_ctrlbank_set(_name, _NAME)				\
 int lm3533_ctrlbank_set_##_name(struct lm3533_ctrlbank *cb, u8 val)	\
 {									\
@@ -102,19 +130,6 @@ lm3533_ctrlbank_set(brightness, BRIGHTNESS);
 lm3533_ctrlbank_get(brightness, BRIGHTNESS);
 
 /*
- * Full scale current.
- *
- * Imax = 5 + val * 0.8 mA, e.g.:
- *
- *    0 - 5 mA
- *     ...
- *   19 - 20.2 mA (default)
- *     ...
- *   31 - 29.8 mA
- */
-lm3533_ctrlbank_set(max_current, MAX_CURRENT);
-
-/*
  * PWM-input control mask:
  *
  *   bit 5 - PWM-input enabled in Zone 4
diff --git a/include/linux/mfd/lm3533.h b/include/linux/mfd/lm3533.h
index 7cfef9e..9660feb 100644
--- a/include/linux/mfd/lm3533.h
+++ b/include/linux/mfd/lm3533.h
@@ -47,15 +47,15 @@ struct lm3533_als_platform_data {
 
 struct lm3533_bl_platform_data {
 	char *name;
+	u16 max_current;		/* 5000 - 29800 uA (800 uA step) */
 	u8 default_brightness;		/* 0 - 255 */
-	u8 max_current;			/* 0 - 31 */
 	u8 pwm;				/* 0 - 0x3f */
 };
 
 struct lm3533_led_platform_data {
 	char *name;
 	const char *default_trigger;
-	u8 max_current;			/* 0 - 31 */
+	u16 max_current;		/* 5000 - 29800 uA (800 uA step) */
 	u8 pwm;				/* 0 - 0x3f */
 };
 
@@ -91,7 +91,8 @@ extern int lm3533_ctrlbank_disable(struct lm3533_ctrlbank *cb);
 
 extern int lm3533_ctrlbank_set_brightness(struct lm3533_ctrlbank *cb, u8 val);
 extern int lm3533_ctrlbank_get_brightness(struct lm3533_ctrlbank *cb, u8 *val);
-extern int lm3533_ctrlbank_set_max_current(struct lm3533_ctrlbank *cb, u8 val);
+extern int lm3533_ctrlbank_set_max_current(struct lm3533_ctrlbank *cb,
+								u16 imax);
 extern int lm3533_ctrlbank_set_pwm(struct lm3533_ctrlbank *cb, u8 val);
 extern int lm3533_ctrlbank_get_pwm(struct lm3533_ctrlbank *cb, u8 *val);
 
-- 
cgit v0.10.2


From 664dd0665ef18462b7fc62dd8bbb3ad5d6e5a7de Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@gmail.com>
Date: Thu, 10 May 2012 23:35:02 +0800
Subject: mfd: Fix lm3533 regmap_update_bits() call

Current code calls regmap_update_bits() with mask and val arguments swapped.

Signed-off-by: Axel Lin <axel.lin@gmail.com>
Acked-by: Johan Hovold <jhovold@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/lm3533-core.c b/drivers/mfd/lm3533-core.c
index d670543..d11c891 100644
--- a/drivers/mfd/lm3533-core.c
+++ b/drivers/mfd/lm3533-core.c
@@ -126,7 +126,7 @@ int lm3533_update(struct lm3533 *lm3533, u8 reg, u8 val, u8 mask)
 
 	dev_dbg(lm3533->dev, "update [%02x]: %02x/%02x\n", reg, val, mask);
 
-	ret = regmap_update_bits(lm3533->regmap, reg, val, mask);
+	ret = regmap_update_bits(lm3533->regmap, reg, mask, val);
 	if (ret < 0) {
 		dev_err(lm3533->dev, "failed to update register %02x: %d\n",
 								reg, ret);
-- 
cgit v0.10.2


From 805b237a63f686f87870af000a5ac464633cb9c8 Mon Sep 17 00:00:00 2001
From: Wim Van Sebroeck <wim@iguana.be>
Date: Thu, 10 May 2012 20:54:32 +0200
Subject: mfd: Change tunnelcreek watchdog name on the lpc_sch subdevices array

The name of the tunnelcreek watchdog device is not tunnelcreek_wdt but
ie6xx_wdt.

Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/lpc_sch.c b/drivers/mfd/lpc_sch.c
index 9bf0839..9f20abc 100644
--- a/drivers/mfd/lpc_sch.c
+++ b/drivers/mfd/lpc_sch.c
@@ -69,7 +69,7 @@ static struct resource wdt_sch_resource = {
 
 static struct mfd_cell tunnelcreek_cells[] = {
 	{
-		.name = "tunnelcreek_wdt",
+		.name = "ie6xx_wdt",
 		.num_resources = 1,
 		.resources = &wdt_sch_resource,
 	},
-- 
cgit v0.10.2


From 6608a5e2dd0dff73fe1bf912036349ad5ed78bdb Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@gmail.com>
Date: Fri, 11 May 2012 09:29:51 +0800
Subject: mfd: Convert da9052 to use devm_* APIs

Signed-off-by: Axel Lin <axel.lin@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/da9052-i2c.c b/drivers/mfd/da9052-i2c.c
index 8410065f1..82c9d64 100644
--- a/drivers/mfd/da9052-i2c.c
+++ b/drivers/mfd/da9052-i2c.c
@@ -70,7 +70,7 @@ static int __devinit da9052_i2c_probe(struct i2c_client *client,
 	struct da9052 *da9052;
 	int ret;
 
-	da9052 = kzalloc(sizeof(struct da9052), GFP_KERNEL);
+	da9052 = devm_kzalloc(&client->dev, sizeof(struct da9052), GFP_KERNEL);
 	if (!da9052)
 		return -ENOMEM;
 
@@ -78,8 +78,7 @@ static int __devinit da9052_i2c_probe(struct i2c_client *client,
 				     I2C_FUNC_SMBUS_BYTE_DATA)) {
 		dev_info(&client->dev, "Error in %s:i2c_check_functionality\n",
 			 __func__);
-		ret = -ENODEV;
-		goto err;
+		return  -ENODEV;
 	}
 
 	da9052->dev = &client->dev;
@@ -87,17 +86,17 @@ static int __devinit da9052_i2c_probe(struct i2c_client *client,
 
 	i2c_set_clientdata(client, da9052);
 
-	da9052->regmap = regmap_init_i2c(client, &da9052_regmap_config);
+	da9052->regmap = devm_regmap_init_i2c(client, &da9052_regmap_config);
 	if (IS_ERR(da9052->regmap)) {
 		ret = PTR_ERR(da9052->regmap);
 		dev_err(&client->dev, "Failed to allocate register map: %d\n",
 			ret);
-		goto err;
+		return ret;
 	}
 
 	ret = da9052_i2c_enable_multiwrite(da9052);
 	if (ret < 0)
-		goto err_regmap;
+		return ret;
 
 #ifdef CONFIG_OF
 	if (!id) {
@@ -112,20 +111,14 @@ static int __devinit da9052_i2c_probe(struct i2c_client *client,
 	if (!id) {
 		ret = -ENODEV;
 		dev_err(&client->dev, "id is null.\n");
-		goto err_regmap;
+		return ret;
 	}
 
 	ret = da9052_device_init(da9052, id->driver_data);
 	if (ret != 0)
-		goto err_regmap;
+		return ret;
 
 	return 0;
-
-err_regmap:
-	regmap_exit(da9052->regmap);
-err:
-	kfree(da9052);
-	return ret;
 }
 
 static int __devexit da9052_i2c_remove(struct i2c_client *client)
@@ -133,9 +126,6 @@ static int __devexit da9052_i2c_remove(struct i2c_client *client)
 	struct da9052 *da9052 = i2c_get_clientdata(client);
 
 	da9052_device_exit(da9052);
-	regmap_exit(da9052->regmap);
-	kfree(da9052);
-
 	return 0;
 }
 
diff --git a/drivers/mfd/da9052-spi.c b/drivers/mfd/da9052-spi.c
index 6e09498..dbeadc5 100644
--- a/drivers/mfd/da9052-spi.c
+++ b/drivers/mfd/da9052-spi.c
@@ -25,8 +25,9 @@ static int __devinit da9052_spi_probe(struct spi_device *spi)
 {
 	int ret;
 	const struct spi_device_id *id = spi_get_device_id(spi);
-	struct da9052 *da9052 = kzalloc(sizeof(struct da9052), GFP_KERNEL);
+	struct da9052 *da9052;
 
+	da9052 = devm_kzalloc(&spi->dev, sizeof(struct da9052), GFP_KERNEL);
 	if (!da9052)
 		return -ENOMEM;
 
@@ -42,25 +43,19 @@ static int __devinit da9052_spi_probe(struct spi_device *spi)
 	da9052_regmap_config.read_flag_mask = 1;
 	da9052_regmap_config.write_flag_mask = 0;
 
-	da9052->regmap = regmap_init_spi(spi, &da9052_regmap_config);
+	da9052->regmap = devm_regmap_init_spi(spi, &da9052_regmap_config);
 	if (IS_ERR(da9052->regmap)) {
 		ret = PTR_ERR(da9052->regmap);
 		dev_err(&spi->dev, "Failed to allocate register map: %d\n",
 			ret);
-		goto err;
+		return ret;
 	}
 
 	ret = da9052_device_init(da9052, id->driver_data);
 	if (ret != 0)
-		goto err_regmap;
+		return ret;
 
 	return 0;
-
-err_regmap:
-	regmap_exit(da9052->regmap);
-err:
-	kfree(da9052);
-	return ret;
 }
 
 static int __devexit da9052_spi_remove(struct spi_device *spi)
@@ -68,9 +63,6 @@ static int __devexit da9052_spi_remove(struct spi_device *spi)
 	struct da9052 *da9052 = dev_get_drvdata(&spi->dev);
 
 	da9052_device_exit(da9052);
-	regmap_exit(da9052->regmap);
-	kfree(da9052);
-
 	return 0;
 }
 
-- 
cgit v0.10.2


From aa4603a0a7663b10e645b32cc808aac00bc390a3 Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@gmail.com>
Date: Fri, 11 May 2012 09:31:29 +0800
Subject: mfd: Convert pcf50633-core to use devm_* APIs

Signed-off-by: Axel Lin <axel.lin@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/pcf50633-core.c b/drivers/mfd/pcf50633-core.c
index 189c2f0..29c122b 100644
--- a/drivers/mfd/pcf50633-core.c
+++ b/drivers/mfd/pcf50633-core.c
@@ -204,7 +204,7 @@ static int __devinit pcf50633_probe(struct i2c_client *client,
 		return -ENOENT;
 	}
 
-	pcf = kzalloc(sizeof(*pcf), GFP_KERNEL);
+	pcf = devm_kzalloc(&client->dev, sizeof(*pcf), GFP_KERNEL);
 	if (!pcf)
 		return -ENOMEM;
 
@@ -212,12 +212,11 @@ static int __devinit pcf50633_probe(struct i2c_client *client,
 
 	mutex_init(&pcf->lock);
 
-	pcf->regmap = regmap_init_i2c(client, &pcf50633_regmap_config);
+	pcf->regmap = devm_regmap_init_i2c(client, &pcf50633_regmap_config);
 	if (IS_ERR(pcf->regmap)) {
 		ret = PTR_ERR(pcf->regmap);
-		dev_err(pcf->dev, "Failed to allocate register map: %d\n",
-			ret);
-		goto err_free;
+		dev_err(pcf->dev, "Failed to allocate register map: %d\n", ret);
+		return ret;
 	}
 
 	i2c_set_clientdata(client, pcf);
@@ -228,7 +227,7 @@ static int __devinit pcf50633_probe(struct i2c_client *client,
 	if (version < 0 || variant < 0) {
 		dev_err(pcf->dev, "Unable to probe pcf50633\n");
 		ret = -ENODEV;
-		goto err_regmap;
+		return ret;
 	}
 
 	dev_info(pcf->dev, "Probed device version %d variant %d\n",
@@ -237,16 +236,11 @@ static int __devinit pcf50633_probe(struct i2c_client *client,
 	pcf50633_irq_init(pcf, client->irq);
 
 	/* Create sub devices */
-	pcf50633_client_dev_register(pcf, "pcf50633-input",
-						&pcf->input_pdev);
-	pcf50633_client_dev_register(pcf, "pcf50633-rtc",
-						&pcf->rtc_pdev);
-	pcf50633_client_dev_register(pcf, "pcf50633-mbc",
-						&pcf->mbc_pdev);
-	pcf50633_client_dev_register(pcf, "pcf50633-adc",
-						&pcf->adc_pdev);
-	pcf50633_client_dev_register(pcf, "pcf50633-backlight",
-						&pcf->bl_pdev);
+	pcf50633_client_dev_register(pcf, "pcf50633-input", &pcf->input_pdev);
+	pcf50633_client_dev_register(pcf, "pcf50633-rtc", &pcf->rtc_pdev);
+	pcf50633_client_dev_register(pcf, "pcf50633-mbc", &pcf->mbc_pdev);
+	pcf50633_client_dev_register(pcf, "pcf50633-adc", &pcf->adc_pdev);
+	pcf50633_client_dev_register(pcf, "pcf50633-backlight", &pcf->bl_pdev);
 
 
 	for (i = 0; i < PCF50633_NUM_REGULATORS; i++) {
@@ -274,13 +268,6 @@ static int __devinit pcf50633_probe(struct i2c_client *client,
 		pdata->probe_done(pcf);
 
 	return 0;
-
-err_regmap:
-	regmap_exit(pcf->regmap);
-err_free:
-	kfree(pcf);
-
-	return ret;
 }
 
 static int __devexit pcf50633_remove(struct i2c_client *client)
@@ -300,9 +287,6 @@ static int __devexit pcf50633_remove(struct i2c_client *client)
 	for (i = 0; i < PCF50633_NUM_REGULATORS; i++)
 		platform_device_unregister(pcf->regulator_pdev[i]);
 
-	regmap_exit(pcf->regmap);
-	kfree(pcf);
-
 	return 0;
 }
 
-- 
cgit v0.10.2


From 10bbc48d7a045c022a54f637c0c6b72f0e38b519 Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Fri, 11 May 2012 21:48:27 +0530
Subject: gpio: Convert tps65910 to a platform driver

Make the gpio-tps65910 as platform driver and register
this from tps65910 core driver as mfd sub device.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/gpio/gpio-tps65910.c b/drivers/gpio/gpio-tps65910.c
index bc155f2..af6dc83 100644
--- a/drivers/gpio/gpio-tps65910.c
+++ b/drivers/gpio/gpio-tps65910.c
@@ -18,11 +18,23 @@
 #include <linux/errno.h>
 #include <linux/gpio.h>
 #include <linux/i2c.h>
+#include <linux/platform_device.h>
 #include <linux/mfd/tps65910.h>
 
+struct tps65910_gpio {
+	struct gpio_chip gpio_chip;
+	struct tps65910 *tps65910;
+};
+
+static inline struct tps65910_gpio *to_tps65910_gpio(struct gpio_chip *chip)
+{
+	return container_of(chip, struct tps65910_gpio, gpio_chip);
+}
+
 static int tps65910_gpio_get(struct gpio_chip *gc, unsigned offset)
 {
-	struct tps65910 *tps65910 = container_of(gc, struct tps65910, gpio);
+	struct tps65910_gpio *tps65910_gpio = to_tps65910_gpio(gc);
+	struct tps65910 *tps65910 = tps65910_gpio->tps65910;
 	unsigned int val;
 
 	tps65910_reg_read(tps65910, TPS65910_GPIO0 + offset, &val);
@@ -36,7 +48,8 @@ static int tps65910_gpio_get(struct gpio_chip *gc, unsigned offset)
 static void tps65910_gpio_set(struct gpio_chip *gc, unsigned offset,
 			      int value)
 {
-	struct tps65910 *tps65910 = container_of(gc, struct tps65910, gpio);
+	struct tps65910_gpio *tps65910_gpio = to_tps65910_gpio(gc);
+	struct tps65910 *tps65910 = tps65910_gpio->tps65910;
 
 	if (value)
 		tps65910_reg_set_bits(tps65910, TPS65910_GPIO0 + offset,
@@ -49,7 +62,8 @@ static void tps65910_gpio_set(struct gpio_chip *gc, unsigned offset,
 static int tps65910_gpio_output(struct gpio_chip *gc, unsigned offset,
 				int value)
 {
-	struct tps65910 *tps65910 = container_of(gc, struct tps65910, gpio);
+	struct tps65910_gpio *tps65910_gpio = to_tps65910_gpio(gc);
+	struct tps65910 *tps65910 = tps65910_gpio->tps65910;
 
 	/* Set the initial value */
 	tps65910_gpio_set(gc, offset, value);
@@ -60,59 +74,109 @@ static int tps65910_gpio_output(struct gpio_chip *gc, unsigned offset,
 
 static int tps65910_gpio_input(struct gpio_chip *gc, unsigned offset)
 {
-	struct tps65910 *tps65910 = container_of(gc, struct tps65910, gpio);
+	struct tps65910_gpio *tps65910_gpio = to_tps65910_gpio(gc);
+	struct tps65910 *tps65910 = tps65910_gpio->tps65910;
 
 	return tps65910_reg_clear_bits(tps65910, TPS65910_GPIO0 + offset,
 						GPIO_CFG_MASK);
 }
 
-void tps65910_gpio_init(struct tps65910 *tps65910, int gpio_base)
+static int __devinit tps65910_gpio_probe(struct platform_device *pdev)
 {
+	struct tps65910 *tps65910 = dev_get_drvdata(pdev->dev.parent);
+	struct tps65910_board *pdata = dev_get_platdata(tps65910->dev);
+	struct tps65910_gpio *tps65910_gpio;
 	int ret;
-	struct tps65910_board *board_data;
+	int i;
+
+	tps65910_gpio = devm_kzalloc(&pdev->dev,
+				sizeof(*tps65910_gpio), GFP_KERNEL);
+	if (!tps65910_gpio) {
+		dev_err(&pdev->dev, "Could not allocate tps65910_gpio\n");
+		return -ENOMEM;
+	}
 
-	if (!gpio_base)
-		return;
+	tps65910_gpio->tps65910 = tps65910;
 
-	tps65910->gpio.owner		= THIS_MODULE;
-	tps65910->gpio.label		= tps65910->i2c_client->name;
-	tps65910->gpio.dev		= tps65910->dev;
-	tps65910->gpio.base		= gpio_base;
+	tps65910_gpio->gpio_chip.owner = THIS_MODULE;
+	tps65910_gpio->gpio_chip.label = tps65910->i2c_client->name;
 
 	switch(tps65910_chip_id(tps65910)) {
 	case TPS65910:
-		tps65910->gpio.ngpio	= TPS65910_NUM_GPIO;
+		tps65910_gpio->gpio_chip.ngpio = TPS65910_NUM_GPIO;
 		break;
 	case TPS65911:
-		tps65910->gpio.ngpio	= TPS65911_NUM_GPIO;
+		tps65910_gpio->gpio_chip.ngpio = TPS65911_NUM_GPIO;
 		break;
 	default:
-		return;
+		return -EINVAL;
 	}
-	tps65910->gpio.can_sleep	= 1;
-
-	tps65910->gpio.direction_input	= tps65910_gpio_input;
-	tps65910->gpio.direction_output	= tps65910_gpio_output;
-	tps65910->gpio.set		= tps65910_gpio_set;
-	tps65910->gpio.get		= tps65910_gpio_get;
-
-	/* Configure sleep control for gpios */
-	board_data = dev_get_platdata(tps65910->dev);
-	if (board_data) {
-		int i;
-		for (i = 0; i < tps65910->gpio.ngpio; ++i) {
-			if (board_data->en_gpio_sleep[i]) {
-				ret = tps65910_reg_set_bits(tps65910,
-					TPS65910_GPIO0 + i, GPIO_SLEEP_MASK);
-				if (ret < 0)
-					dev_warn(tps65910->dev,
-						"GPIO Sleep setting failed\n");
-			}
-		}
+	tps65910_gpio->gpio_chip.can_sleep = 1;
+	tps65910_gpio->gpio_chip.direction_input = tps65910_gpio_input;
+	tps65910_gpio->gpio_chip.direction_output = tps65910_gpio_output;
+	tps65910_gpio->gpio_chip.set	= tps65910_gpio_set;
+	tps65910_gpio->gpio_chip.get	= tps65910_gpio_get;
+	tps65910_gpio->gpio_chip.dev = &pdev->dev;
+	if (pdata && pdata->gpio_base)
+		tps65910_gpio->gpio_chip.base = pdata->gpio_base;
+	else
+		tps65910_gpio->gpio_chip.base = -1;
+
+	if (!pdata)
+		goto skip_init;
+
+	/* Configure sleep control for gpios if provided */
+	for (i = 0; i < tps65910_gpio->gpio_chip.ngpio; ++i) {
+		if (!pdata->en_gpio_sleep[i])
+			continue;
+
+		ret = tps65910_reg_set_bits(tps65910,
+			TPS65910_GPIO0 + i, GPIO_SLEEP_MASK);
+		if (ret < 0)
+			dev_warn(tps65910->dev,
+				"GPIO Sleep setting failed with err %d\n", ret);
+	}
+
+skip_init:
+	ret = gpiochip_add(&tps65910_gpio->gpio_chip);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Could not register gpiochip, %d\n", ret);
+		return ret;
 	}
 
-	ret = gpiochip_add(&tps65910->gpio);
+	platform_set_drvdata(pdev, tps65910_gpio);
 
-	if (ret)
-		dev_warn(tps65910->dev, "GPIO registration failed: %d\n", ret);
+	return ret;
 }
+
+static int __devexit tps65910_gpio_remove(struct platform_device *pdev)
+{
+	struct tps65910_gpio *tps65910_gpio = platform_get_drvdata(pdev);
+
+	return gpiochip_remove(&tps65910_gpio->gpio_chip);
+}
+
+static struct platform_driver tps65910_gpio_driver = {
+	.driver.name    = "tps65910-gpio",
+	.driver.owner   = THIS_MODULE,
+	.probe		= tps65910_gpio_probe,
+	.remove		= __devexit_p(tps65910_gpio_remove),
+};
+
+static int __init tps65910_gpio_init(void)
+{
+	return platform_driver_register(&tps65910_gpio_driver);
+}
+subsys_initcall(tps65910_gpio_init);
+
+static void __exit tps65910_gpio_exit(void)
+{
+	platform_driver_unregister(&tps65910_gpio_driver);
+}
+module_exit(tps65910_gpio_exit);
+
+MODULE_AUTHOR("Graeme Gregory <gg@slimlogic.co.uk>");
+MODULE_AUTHOR("Jorge Eduardo Candelaria jedu@slimlogic.co.uk>");
+MODULE_DESCRIPTION("GPIO interface for TPS65910/TPS6511 PMICs");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:tps65910-gpio");
-- 
cgit v0.10.2


From 168755ebb11e8bc17f2c12c42534adaf003a7d7e Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Sat, 12 May 2012 14:01:04 +0300
Subject: mfd: Silence an lm3533 gcc warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is supposed to be umode_t.  It causes a GCC warning:
drivers/mfd/lm3533-core.c:440:2: warning: initialization from incompatible pointer type [enabled by default]
drivers/mfd/lm3533-core.c:440:2: warning: (near initialization for ‘lm3533_attribute_group.is_visible’) [enabled by default]

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/lm3533-core.c b/drivers/mfd/lm3533-core.c
index d11c891..1627472 100644
--- a/drivers/mfd/lm3533-core.c
+++ b/drivers/mfd/lm3533-core.c
@@ -359,7 +359,7 @@ static struct attribute *lm3533_attributes[] = {
 #define to_dev_attr(_attr) \
 	container_of(_attr, struct device_attribute, attr)
 
-static mode_t lm3533_attr_is_visible(struct kobject *kobj,
+static umode_t lm3533_attr_is_visible(struct kobject *kobj,
 					     struct attribute *attr, int n)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
-- 
cgit v0.10.2


From eee0e4b44f855f4a34c77137cb64d08c9fb484e9 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 14 May 2012 10:13:15 +0100
Subject: mfd: Don't support non-modular wm8400 build

It's relying on non-exported symbols.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index b914483..af46ce0 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -465,7 +465,7 @@ config MFD_S5M_CORE
 	 of the device
 
 config MFD_WM8400
-	tristate "Support Wolfson Microelectronics WM8400"
+	bool "Support Wolfson Microelectronics WM8400"
 	select MFD_CORE
 	depends on I2C
 	select REGMAP_I2C
-- 
cgit v0.10.2


From ebd29c6cc0b29b4bb041441fc251e0f400eea2cf Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 14 May 2012 10:13:16 +0100
Subject: mfd: Export wm8400_block_read()

Used by the regulator driver.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/wm8400-core.c b/drivers/mfd/wm8400-core.c
index 9083b77..4b7d378 100644
--- a/drivers/mfd/wm8400-core.c
+++ b/drivers/mfd/wm8400-core.c
@@ -60,6 +60,7 @@ int wm8400_block_read(struct wm8400 *wm8400, u8 reg, int count, u16 *data)
 {
 	return regmap_bulk_read(wm8400->regmap, reg, data, count);
 }
+EXPORT_SYMBOL_GPL(wm8400_block_read);
 
 static int wm8400_register_codec(struct wm8400 *wm8400)
 {
-- 
cgit v0.10.2


From 83871c00bb43f41d85dd15aba56a83bbb191eabc Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Mon, 14 May 2012 22:50:39 +0200
Subject: mfd: Add MAX77693 driver

This patch adds MFD driver for MAX77693 to enable its sub devices.

The MAX77693 is a multi-function devices. It includes PMIC,
MUIC(Micro USB Interface Controller), flash LED control and
haptic motor control.

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Myungjoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Reviewed-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index af46ce0..a0e1b83 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -420,6 +420,18 @@ config PMIC_ADP5520
 	  individual components like LCD backlight, LEDs, GPIOs and Kepad
 	  under the corresponding menus.
 
+config MFD_MAX77693
+	bool "Maxim Semiconductor MAX77693 PMIC Support"
+	depends on I2C=y && GENERIC_HARDIRQS
+	select MFD_CORE
+	help
+	  Say yes here to support for Maxim Semiconductor MAX77693.
+	  This is a companion Power Management IC with Flash, Haptic, Charger,
+	  and MUIC(Micro USB Interface Controller) controls on chip.
+	  This driver provides common support for accessing the device;
+	  additional drivers must be enabled in order to use the functionality
+	  of the device.
+
 config MFD_MAX8925
 	bool "Maxim Semiconductor MAX8925 PMIC Support"
 	depends on I2C=y && GENERIC_HARDIRQS
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index d3dae95..db0262b 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -78,6 +78,7 @@ obj-$(CONFIG_PMIC_DA9052)	+= da9052-core.o
 obj-$(CONFIG_MFD_DA9052_SPI)	+= da9052-spi.o
 obj-$(CONFIG_MFD_DA9052_I2C)	+= da9052-i2c.o
 
+obj-$(CONFIG_MFD_MAX77693)	+= max77693.o
 max8925-objs			:= max8925-core.o max8925-i2c.o
 obj-$(CONFIG_MFD_MAX8925)	+= max8925.o
 obj-$(CONFIG_MFD_MAX8997)	+= max8997.o max8997-irq.o
diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c
new file mode 100644
index 0000000..c852515
--- /dev/null
+++ b/drivers/mfd/max77693.c
@@ -0,0 +1,217 @@
+/*
+ * max77693.c - mfd core driver for the MAX 77693
+ *
+ * Copyright (C) 2012 Samsung Electronics
+ * SangYoung Son <hello.son@smasung.com>
+ *
+ * This program is not provided / owned by Maxim Integrated Products.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * This driver is based on max8997.c
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/pm_runtime.h>
+#include <linux/mutex.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/max77693.h>
+#include <linux/mfd/max77693-private.h>
+#include <linux/regulator/machine.h>
+#include <linux/regmap.h>
+
+#define I2C_ADDR_PMIC	(0xCC >> 1)	/* Charger, Flash LED */
+#define I2C_ADDR_MUIC	(0x4A >> 1)
+#define I2C_ADDR_HAPTIC	(0x90 >> 1)
+
+static struct mfd_cell max77693_devs[] = {
+	{ .name = "max77693-pmic", },
+	{ .name = "max77693-charger", },
+	{ .name = "max77693-flash", },
+	{ .name = "max77693-muic", },
+	{ .name = "max77693-haptic", },
+};
+
+int max77693_read_reg(struct regmap *map, u8 reg, u8 *dest)
+{
+	unsigned int val;
+	int ret;
+
+	ret = regmap_read(map, reg, &val);
+	*dest = val;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(max77693_read_reg);
+
+int max77693_bulk_read(struct regmap *map, u8 reg, int count, u8 *buf)
+{
+	int ret;
+
+	ret = regmap_bulk_read(map, reg, buf, count);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(max77693_bulk_read);
+
+int max77693_write_reg(struct regmap *map, u8 reg, u8 value)
+{
+	int ret;
+
+	ret = regmap_write(map, reg, value);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(max77693_write_reg);
+
+int max77693_bulk_write(struct regmap *map, u8 reg, int count, u8 *buf)
+{
+	int ret;
+
+	ret = regmap_bulk_write(map, reg, buf, count);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(max77693_bulk_write);
+
+int max77693_update_reg(struct regmap *map, u8 reg, u8 val, u8 mask)
+{
+	int ret;
+
+	ret = regmap_update_bits(map, reg, mask, val);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(max77693_update_reg);
+
+static const struct regmap_config max77693_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = MAX77693_PMIC_REG_END,
+};
+
+static int max77693_i2c_probe(struct i2c_client *i2c,
+			      const struct i2c_device_id *id)
+{
+	struct max77693_dev *max77693;
+	struct max77693_platform_data *pdata = i2c->dev.platform_data;
+	u8 reg_data;
+	int ret = 0;
+
+	max77693 = devm_kzalloc(&i2c->dev,
+			sizeof(struct max77693_dev), GFP_KERNEL);
+	if (max77693 == NULL)
+		return -ENOMEM;
+
+	max77693->regmap = devm_regmap_init_i2c(i2c, &max77693_regmap_config);
+	if (IS_ERR(max77693->regmap)) {
+		ret = PTR_ERR(max77693->regmap);
+		dev_err(max77693->dev,"failed to allocate register map: %d\n",
+				ret);
+		goto err_regmap;
+	}
+
+	i2c_set_clientdata(i2c, max77693);
+	max77693->dev = &i2c->dev;
+	max77693->i2c = i2c;
+	max77693->irq = i2c->irq;
+	max77693->type = id->driver_data;
+
+	if (!pdata)
+		goto err_regmap;
+
+	max77693->wakeup = pdata->wakeup;
+
+	mutex_init(&max77693->iolock);
+
+	if (max77693_read_reg(max77693->regmap,
+				MAX77693_PMIC_REG_PMIC_ID2, &reg_data) < 0) {
+		dev_err(max77693->dev, "device not found on this channel\n");
+		ret = -ENODEV;
+		goto err_regmap;
+	} else
+		dev_info(max77693->dev, "device ID: 0x%x\n", reg_data);
+
+	max77693->muic = i2c_new_dummy(i2c->adapter, I2C_ADDR_MUIC);
+	i2c_set_clientdata(max77693->muic, max77693);
+
+	max77693->haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC);
+	i2c_set_clientdata(max77693->haptic, max77693);
+
+	pm_runtime_set_active(max77693->dev);
+
+	ret = mfd_add_devices(max77693->dev, -1, max77693_devs,
+			ARRAY_SIZE(max77693_devs), NULL, 0);
+	if (ret < 0)
+		goto err_mfd;
+
+	return ret;
+
+err_mfd:
+	i2c_unregister_device(max77693->muic);
+	i2c_unregister_device(max77693->haptic);
+err_regmap:
+	kfree(max77693);
+
+	return ret;
+}
+
+static int max77693_i2c_remove(struct i2c_client *i2c)
+{
+	struct max77693_dev *max77693 = i2c_get_clientdata(i2c);
+
+	mfd_remove_devices(max77693->dev);
+	i2c_unregister_device(max77693->muic);
+	i2c_unregister_device(max77693->haptic);
+
+	return 0;
+}
+
+static const struct i2c_device_id max77693_i2c_id[] = {
+	{ "max77693", TYPE_MAX77693 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, max77693_i2c_id);
+
+static struct i2c_driver max77693_i2c_driver = {
+	.driver = {
+		   .name = "max77693",
+		   .owner = THIS_MODULE,
+	},
+	.probe = max77693_i2c_probe,
+	.remove = max77693_i2c_remove,
+	.id_table = max77693_i2c_id,
+};
+
+static int __init max77693_i2c_init(void)
+{
+	return i2c_add_driver(&max77693_i2c_driver);
+}
+/* init early so consumer devices can complete system boot */
+subsys_initcall(max77693_i2c_init);
+
+static void __exit max77693_i2c_exit(void)
+{
+	i2c_del_driver(&max77693_i2c_driver);
+}
+module_exit(max77693_i2c_exit);
+
+MODULE_DESCRIPTION("MAXIM 77693 multi-function core driver");
+MODULE_AUTHOR("SangYoung, Son <hello.son@samsung.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h
new file mode 100644
index 0000000..bf6077d
--- /dev/null
+++ b/include/linux/mfd/max77693-private.h
@@ -0,0 +1,217 @@
+/*
+ * max77693-private.h - Voltage regulator driver for the Maxim 77693
+ *
+ *  Copyright (C) 2012 Samsung Electrnoics
+ *  SangYoung Son <hello.son@samsung.com>
+ *
+ * This program is not provided / owned by Maxim Integrated Products.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __LINUX_MFD_MAX77693_PRIV_H
+#define __LINUX_MFD_MAX77693_PRIV_H
+
+#include <linux/i2c.h>
+
+#define MAX77693_NUM_IRQ_MUIC_REGS	3
+#define MAX77693_REG_INVALID		(0xff)
+
+/* Slave addr = 0xCC: PMIC, Charger, Flash LED */
+enum max77693_pmic_reg {
+	MAX77693_LED_REG_IFLASH1			= 0x00,
+	MAX77693_LED_REG_IFLASH2			= 0x01,
+	MAX77693_LED_REG_ITORCH				= 0x02,
+	MAX77693_LED_REG_ITORCHTIMER			= 0x03,
+	MAX77693_LED_REG_FLASH_TIMER			= 0x04,
+	MAX77693_LED_REG_FLASH_EN			= 0x05,
+	MAX77693_LED_REG_MAX_FLASH1			= 0x06,
+	MAX77693_LED_REG_MAX_FLASH2			= 0x07,
+	MAX77693_LED_REG_MAX_FLASH3			= 0x08,
+	MAX77693_LED_REG_MAX_FLASH4			= 0x09,
+	MAX77693_LED_REG_VOUT_CNTL			= 0x0A,
+	MAX77693_LED_REG_VOUT_FLASH1			= 0x0B,
+	MAX77693_LED_REG_VOUT_FLASH2			= 0x0C,
+	MAX77693_LED_REG_FLASH_INT			= 0x0E,
+	MAX77693_LED_REG_FLASH_INT_MASK			= 0x0F,
+	MAX77693_LED_REG_FLASH_INT_STATUS		= 0x10,
+
+	MAX77693_PMIC_REG_PMIC_ID1			= 0x20,
+	MAX77693_PMIC_REG_PMIC_ID2			= 0x21,
+	MAX77693_PMIC_REG_INTSRC			= 0x22,
+	MAX77693_PMIC_REG_INTSRC_MASK			= 0x23,
+	MAX77693_PMIC_REG_TOPSYS_INT			= 0x24,
+	MAX77693_PMIC_REG_TOPSYS_INT_MASK		= 0x26,
+	MAX77693_PMIC_REG_TOPSYS_STAT			= 0x28,
+	MAX77693_PMIC_REG_MAINCTRL1			= 0x2A,
+	MAX77693_PMIC_REG_LSCNFG			= 0x2B,
+
+	MAX77693_CHG_REG_CHG_INT			= 0xB0,
+	MAX77693_CHG_REG_CHG_INT_MASK			= 0xB1,
+	MAX77693_CHG_REG_CHG_INT_OK			= 0xB2,
+	MAX77693_CHG_REG_CHG_DETAILS_00			= 0xB3,
+	MAX77693_CHG_REG_CHG_DETAILS_01			= 0xB4,
+	MAX77693_CHG_REG_CHG_DETAILS_02			= 0xB5,
+	MAX77693_CHG_REG_CHG_DETAILS_03			= 0xB6,
+	MAX77693_CHG_REG_CHG_CNFG_00			= 0xB7,
+	MAX77693_CHG_REG_CHG_CNFG_01			= 0xB8,
+	MAX77693_CHG_REG_CHG_CNFG_02			= 0xB9,
+	MAX77693_CHG_REG_CHG_CNFG_03			= 0xBA,
+	MAX77693_CHG_REG_CHG_CNFG_04			= 0xBB,
+	MAX77693_CHG_REG_CHG_CNFG_05			= 0xBC,
+	MAX77693_CHG_REG_CHG_CNFG_06			= 0xBD,
+	MAX77693_CHG_REG_CHG_CNFG_07			= 0xBE,
+	MAX77693_CHG_REG_CHG_CNFG_08			= 0xBF,
+	MAX77693_CHG_REG_CHG_CNFG_09			= 0xC0,
+	MAX77693_CHG_REG_CHG_CNFG_10			= 0xC1,
+	MAX77693_CHG_REG_CHG_CNFG_11			= 0xC2,
+	MAX77693_CHG_REG_CHG_CNFG_12			= 0xC3,
+	MAX77693_CHG_REG_CHG_CNFG_13			= 0xC4,
+	MAX77693_CHG_REG_CHG_CNFG_14			= 0xC5,
+	MAX77693_CHG_REG_SAFEOUT_CTRL			= 0xC6,
+
+	MAX77693_PMIC_REG_END,
+};
+
+/* Slave addr = 0x4A: MUIC */
+enum max77693_muic_reg {
+	MAX77693_MUIC_REG_ID		= 0x00,
+	MAX77693_MUIC_REG_INT1		= 0x01,
+	MAX77693_MUIC_REG_INT2		= 0x02,
+	MAX77693_MUIC_REG_INT3		= 0x03,
+	MAX77693_MUIC_REG_STATUS1	= 0x04,
+	MAX77693_MUIC_REG_STATUS2	= 0x05,
+	MAX77693_MUIC_REG_STATUS3	= 0x06,
+	MAX77693_MUIC_REG_INTMASK1	= 0x07,
+	MAX77693_MUIC_REG_INTMASK2	= 0x08,
+	MAX77693_MUIC_REG_INTMASK3	= 0x09,
+	MAX77693_MUIC_REG_CDETCTRL1	= 0x0A,
+	MAX77693_MUIC_REG_CDETCTRL2	= 0x0B,
+	MAX77693_MUIC_REG_CTRL1		= 0x0C,
+	MAX77693_MUIC_REG_CTRL2		= 0x0D,
+	MAX77693_MUIC_REG_CTRL3		= 0x0E,
+
+	MAX77693_MUIC_REG_END,
+};
+
+/* Slave addr = 0x90: Haptic */
+enum max77693_haptic_reg {
+	MAX77693_HAPTIC_REG_STATUS		= 0x00,
+	MAX77693_HAPTIC_REG_CONFIG1		= 0x01,
+	MAX77693_HAPTIC_REG_CONFIG2		= 0x02,
+	MAX77693_HAPTIC_REG_CONFIG_CHNL		= 0x03,
+	MAX77693_HAPTIC_REG_CONFG_CYC1		= 0x04,
+	MAX77693_HAPTIC_REG_CONFG_CYC2		= 0x05,
+	MAX77693_HAPTIC_REG_CONFIG_PER1		= 0x06,
+	MAX77693_HAPTIC_REG_CONFIG_PER2		= 0x07,
+	MAX77693_HAPTIC_REG_CONFIG_PER3		= 0x08,
+	MAX77693_HAPTIC_REG_CONFIG_PER4		= 0x09,
+	MAX77693_HAPTIC_REG_CONFIG_DUTY1	= 0x0A,
+	MAX77693_HAPTIC_REG_CONFIG_DUTY2	= 0x0B,
+	MAX77693_HAPTIC_REG_CONFIG_PWM1		= 0x0C,
+	MAX77693_HAPTIC_REG_CONFIG_PWM2		= 0x0D,
+	MAX77693_HAPTIC_REG_CONFIG_PWM3		= 0x0E,
+	MAX77693_HAPTIC_REG_CONFIG_PWM4		= 0x0F,
+	MAX77693_HAPTIC_REG_REV			= 0x10,
+
+	MAX77693_HAPTIC_REG_END,
+};
+
+enum max77693_irq_source {
+	LED_INT = 0,
+	TOPSYS_INT,
+	CHG_INT,
+	MUIC_INT1,
+	MUIC_INT2,
+	MUIC_INT3,
+
+	MAX77693_IRQ_GROUP_NR,
+};
+
+enum max77693_irq {
+	/* PMIC - FLASH */
+	MAX77693_LED_IRQ_FLED2_OPEN,
+	MAX77693_LED_IRQ_FLED2_SHORT,
+	MAX77693_LED_IRQ_FLED1_OPEN,
+	MAX77693_LED_IRQ_FLED1_SHORT,
+	MAX77693_LED_IRQ_MAX_FLASH,
+
+	/* PMIC - TOPSYS */
+	MAX77693_TOPSYS_IRQ_T120C_INT,
+	MAX77693_TOPSYS_IRQ_T140C_INT,
+	MAX77693_TOPSYS_IRQ_LOWSYS_INT,
+
+	/* PMIC - Charger */
+	MAX77693_CHG_IRQ_BYP_I,
+	MAX77693_CHG_IRQ_THM_I,
+	MAX77693_CHG_IRQ_BAT_I,
+	MAX77693_CHG_IRQ_CHG_I,
+	MAX77693_CHG_IRQ_CHGIN_I,
+
+	/* MUIC INT1 */
+	MAX77693_MUIC_IRQ_INT1_ADC,
+	MAX77693_MUIC_IRQ_INT1_ADC_LOW,
+	MAX77693_MUIC_IRQ_INT1_ADC_ERR,
+	MAX77693_MUIC_IRQ_INT1_ADC1K,
+
+	/* MUIC INT2 */
+	MAX77693_MUIC_IRQ_INT2_CHGTYP,
+	MAX77693_MUIC_IRQ_INT2_CHGDETREUN,
+	MAX77693_MUIC_IRQ_INT2_DCDTMR,
+	MAX77693_MUIC_IRQ_INT2_DXOVP,
+	MAX77693_MUIC_IRQ_INT2_VBVOLT,
+	MAX77693_MUIC_IRQ_INT2_VIDRM,
+
+	/* MUIC INT3 */
+	MAX77693_MUIC_IRQ_INT3_EOC,
+	MAX77693_MUIC_IRQ_INT3_CGMBC,
+	MAX77693_MUIC_IRQ_INT3_OVP,
+	MAX77693_MUIC_IRQ_INT3_MBCCHG_ERR,
+	MAX77693_MUIC_IRQ_INT3_CHG_ENABLED,
+	MAX77693_MUIC_IRQ_INT3_BAT_DET,
+
+	MAX77693_IRQ_NR,
+};
+
+struct max77693_dev {
+	struct device *dev;
+	struct i2c_client *i2c;		/* 0xCC , PMIC, Charger, Flash LED */
+	struct i2c_client *muic;	/* 0x4A , MUIC */
+	struct i2c_client *haptic;	/* 0x90 , Haptic */
+	struct mutex iolock;
+
+	int type;
+
+	struct regmap *regmap;
+	struct regmap *regmap_muic;
+	struct regmap *regmap_haptic;
+
+	int irq;
+	bool wakeup;
+};
+
+enum max77693_types {
+	TYPE_MAX77693,
+};
+
+extern int max77693_read_reg(struct regmap *map, u8 reg, u8 *dest);
+extern int max77693_bulk_read(struct regmap *map, u8 reg, int count,
+				u8 *buf);
+extern int max77693_write_reg(struct regmap *map, u8 reg, u8 value);
+extern int max77693_bulk_write(struct regmap *map, u8 reg, int count,
+				u8 *buf);
+extern int max77693_update_reg(struct regmap *map, u8 reg, u8 val, u8 mask);
+
+#endif /*  __LINUX_MFD_MAX77693_PRIV_H */
diff --git a/include/linux/mfd/max77693.h b/include/linux/mfd/max77693.h
new file mode 100644
index 0000000..5020b86
--- /dev/null
+++ b/include/linux/mfd/max77693.h
@@ -0,0 +1,37 @@
+/*
+ * max77693.h - Driver for the Maxim 77693
+ *
+ *  Copyright (C) 2012 Samsung Electrnoics
+ *  SangYoung Son <hello.son@samsung.com>
+ *
+ * This program is not provided / owned by Maxim Integrated Products.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * This driver is based on max8997.h
+ *
+ * MAX77693 has PMIC, Charger, Flash LED, Haptic, MUIC devices.
+ * The devices share the same I2C bus and included in
+ * this mfd driver.
+ */
+
+#ifndef __LINUX_MFD_MAX77693_H
+#define __LINUX_MFD_MAX77693_H
+
+struct max77693_platform_data {
+	/* IRQ */
+	int wakeup;
+};
+#endif	/* __LINUX_MFD_MAX77693_H */
-- 
cgit v0.10.2


From 6592ebb3979c1ec0e37eb06553ef5ce9d6f5f025 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Mon, 14 May 2012 22:54:20 +0200
Subject: mfd: Add MAX77693 irq handler

This patch supports IRQ handling for MAX77693.

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Myungjoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index db0262b..d713851 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -78,7 +78,7 @@ obj-$(CONFIG_PMIC_DA9052)	+= da9052-core.o
 obj-$(CONFIG_MFD_DA9052_SPI)	+= da9052-spi.o
 obj-$(CONFIG_MFD_DA9052_I2C)	+= da9052-i2c.o
 
-obj-$(CONFIG_MFD_MAX77693)	+= max77693.o
+obj-$(CONFIG_MFD_MAX77693)	+= max77693.o max77693-irq.o
 max8925-objs			:= max8925-core.o max8925-i2c.o
 obj-$(CONFIG_MFD_MAX8925)	+= max8925.o
 obj-$(CONFIG_MFD_MAX8997)	+= max8997.o max8997-irq.o
diff --git a/drivers/mfd/max77693-irq.c b/drivers/mfd/max77693-irq.c
new file mode 100644
index 0000000..2b40356
--- /dev/null
+++ b/drivers/mfd/max77693-irq.c
@@ -0,0 +1,309 @@
+/*
+ * max77693-irq.c - Interrupt controller support for MAX77693
+ *
+ * Copyright (C) 2012 Samsung Electronics Co.Ltd
+ * SangYoung Son <hello.son@samsung.com>
+ *
+ * This program is not provided / owned by Maxim Integrated Products.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * This driver is based on max8997-irq.c
+ */
+
+#include <linux/err.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/irqdomain.h>
+#include <linux/mfd/max77693.h>
+#include <linux/mfd/max77693-private.h>
+
+static const u8 max77693_mask_reg[] = {
+	[LED_INT] = MAX77693_LED_REG_FLASH_INT_MASK,
+	[TOPSYS_INT] = MAX77693_PMIC_REG_TOPSYS_INT_MASK,
+	[CHG_INT] = MAX77693_CHG_REG_CHG_INT_MASK,
+	[MUIC_INT1] = MAX77693_MUIC_REG_INTMASK1,
+	[MUIC_INT2] = MAX77693_MUIC_REG_INTMASK2,
+	[MUIC_INT3] = MAX77693_MUIC_REG_INTMASK3,
+};
+
+static struct regmap *max77693_get_regmap(struct max77693_dev *max77693,
+				enum max77693_irq_source src)
+{
+	switch (src) {
+	case LED_INT ... CHG_INT:
+		return max77693->regmap;
+	case MUIC_INT1 ... MUIC_INT3:
+		return max77693->regmap_muic;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+}
+
+struct max77693_irq_data {
+	int mask;
+	enum max77693_irq_source group;
+};
+
+#define DECLARE_IRQ(idx, _group, _mask)		\
+	[(idx)] = { .group = (_group), .mask = (_mask) }
+static const struct max77693_irq_data max77693_irqs[] = {
+	DECLARE_IRQ(MAX77693_LED_IRQ_FLED2_OPEN,	LED_INT, 1 << 0),
+	DECLARE_IRQ(MAX77693_LED_IRQ_FLED2_SHORT,	LED_INT, 1 << 1),
+	DECLARE_IRQ(MAX77693_LED_IRQ_FLED1_OPEN,	LED_INT, 1 << 2),
+	DECLARE_IRQ(MAX77693_LED_IRQ_FLED1_SHORT,	LED_INT, 1 << 3),
+	DECLARE_IRQ(MAX77693_LED_IRQ_MAX_FLASH,		LED_INT, 1 << 4),
+
+	DECLARE_IRQ(MAX77693_TOPSYS_IRQ_T120C_INT,	TOPSYS_INT, 1 << 0),
+	DECLARE_IRQ(MAX77693_TOPSYS_IRQ_T140C_INT,	TOPSYS_INT, 1 << 1),
+	DECLARE_IRQ(MAX77693_TOPSYS_IRQ_LOWSYS_INT,	TOPSYS_INT, 1 << 3),
+
+	DECLARE_IRQ(MAX77693_CHG_IRQ_BYP_I,		CHG_INT, 1 << 0),
+	DECLARE_IRQ(MAX77693_CHG_IRQ_THM_I,		CHG_INT, 1 << 2),
+	DECLARE_IRQ(MAX77693_CHG_IRQ_BAT_I,		CHG_INT, 1 << 3),
+	DECLARE_IRQ(MAX77693_CHG_IRQ_CHG_I,		CHG_INT, 1 << 4),
+	DECLARE_IRQ(MAX77693_CHG_IRQ_CHGIN_I,		CHG_INT, 1 << 6),
+
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT1_ADC,		MUIC_INT1, 1 << 0),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT1_ADC_LOW,	MUIC_INT1, 1 << 1),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT1_ADC_ERR,	MUIC_INT1, 1 << 2),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT1_ADC1K,	MUIC_INT1, 1 << 3),
+
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT2_CHGTYP,	MUIC_INT2, 1 << 0),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT2_CHGDETREUN,	MUIC_INT2, 1 << 1),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT2_DCDTMR,	MUIC_INT2, 1 << 2),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT2_DXOVP,	MUIC_INT2, 1 << 3),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT2_VBVOLT,	MUIC_INT2, 1 << 4),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT2_VIDRM,	MUIC_INT2, 1 << 5),
+
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT3_EOC,		MUIC_INT3, 1 << 0),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT3_CGMBC,	MUIC_INT3, 1 << 1),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT3_OVP,		MUIC_INT3, 1 << 2),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT3_MBCCHG_ERR,	MUIC_INT3, 1 << 3),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT3_CHG_ENABLED,	MUIC_INT3, 1 << 4),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT3_BAT_DET,	MUIC_INT3, 1 << 5),
+};
+
+static void max77693_irq_lock(struct irq_data *data)
+{
+	struct max77693_dev *max77693 = irq_get_chip_data(data->irq);
+
+	mutex_lock(&max77693->irqlock);
+}
+
+static void max77693_irq_sync_unlock(struct irq_data *data)
+{
+	struct max77693_dev *max77693 = irq_get_chip_data(data->irq);
+	int i;
+
+	for (i = 0; i < MAX77693_IRQ_GROUP_NR; i++) {
+		u8 mask_reg = max77693_mask_reg[i];
+		struct regmap *map = max77693_get_regmap(max77693, i);
+
+		if (mask_reg == MAX77693_REG_INVALID ||
+				IS_ERR_OR_NULL(map))
+			continue;
+		max77693->irq_masks_cache[i] = max77693->irq_masks_cur[i];
+
+		max77693_write_reg(map, max77693_mask_reg[i],
+				max77693->irq_masks_cur[i]);
+	}
+
+	mutex_unlock(&max77693->irqlock);
+}
+
+static const inline struct max77693_irq_data *
+irq_to_max77693_irq(struct max77693_dev *max77693, int irq)
+{
+	return &max77693_irqs[irq];
+}
+
+static void max77693_irq_mask(struct irq_data *data)
+{
+	struct max77693_dev *max77693 = irq_get_chip_data(data->irq);
+	const struct max77693_irq_data *irq_data =
+				irq_to_max77693_irq(max77693, data->irq);
+
+	if (irq_data->group >= MUIC_INT1 && irq_data->group <= MUIC_INT3)
+		max77693->irq_masks_cur[irq_data->group] &= ~irq_data->mask;
+	else
+		max77693->irq_masks_cur[irq_data->group] |= irq_data->mask;
+}
+
+static void max77693_irq_unmask(struct irq_data *data)
+{
+	struct max77693_dev *max77693 = irq_get_chip_data(data->irq);
+	const struct max77693_irq_data *irq_data =
+	    irq_to_max77693_irq(max77693, data->irq);
+
+	if (irq_data->group >= MUIC_INT1 && irq_data->group <= MUIC_INT3)
+		max77693->irq_masks_cur[irq_data->group] |= irq_data->mask;
+	else
+		max77693->irq_masks_cur[irq_data->group] &= ~irq_data->mask;
+}
+
+static struct irq_chip max77693_irq_chip = {
+	.name			= "max77693",
+	.irq_bus_lock		= max77693_irq_lock,
+	.irq_bus_sync_unlock	= max77693_irq_sync_unlock,
+	.irq_mask		= max77693_irq_mask,
+	.irq_unmask		= max77693_irq_unmask,
+};
+
+#define MAX77693_IRQSRC_CHG		(1 << 0)
+#define MAX77693_IRQSRC_TOP		(1 << 1)
+#define MAX77693_IRQSRC_FLASH		(1 << 2)
+#define MAX77693_IRQSRC_MUIC		(1 << 3)
+static irqreturn_t max77693_irq_thread(int irq, void *data)
+{
+	struct max77693_dev *max77693 = data;
+	u8 irq_reg[MAX77693_IRQ_GROUP_NR] = {};
+	u8 irq_src;
+	int ret;
+	int i, cur_irq;
+
+	ret = max77693_read_reg(max77693->regmap, MAX77693_PMIC_REG_INTSRC,
+				&irq_src);
+	if (ret < 0) {
+		dev_err(max77693->dev, "Failed to read interrupt source: %d\n",
+				ret);
+		return IRQ_NONE;
+	}
+
+	if (irq_src & MAX77693_IRQSRC_CHG)
+		/* CHG_INT */
+		ret = max77693_read_reg(max77693->regmap, MAX77693_CHG_REG_CHG_INT,
+				&irq_reg[CHG_INT]);
+
+	if (irq_src & MAX77693_IRQSRC_TOP)
+		/* TOPSYS_INT */
+		ret = max77693_read_reg(max77693->regmap,
+			MAX77693_PMIC_REG_TOPSYS_INT, &irq_reg[TOPSYS_INT]);
+
+	if (irq_src & MAX77693_IRQSRC_FLASH)
+		/* LED_INT */
+		ret = max77693_read_reg(max77693->regmap,
+			MAX77693_LED_REG_FLASH_INT, &irq_reg[LED_INT]);
+
+	if (irq_src & MAX77693_IRQSRC_MUIC)
+		/* MUIC INT1 ~ INT3 */
+		max77693_bulk_read(max77693->regmap, MAX77693_MUIC_REG_INT1,
+			MAX77693_NUM_IRQ_MUIC_REGS, &irq_reg[MUIC_INT1]);
+
+	/* Apply masking */
+	for (i = 0; i < MAX77693_IRQ_GROUP_NR; i++) {
+		if (i >= MUIC_INT1 && i <= MUIC_INT3)
+			irq_reg[i] &= max77693->irq_masks_cur[i];
+		else
+			irq_reg[i] &= ~max77693->irq_masks_cur[i];
+	}
+
+	/* Report */
+	for (i = 0; i < MAX77693_IRQ_NR; i++) {
+		if (irq_reg[max77693_irqs[i].group] & max77693_irqs[i].mask) {
+			cur_irq = irq_find_mapping(max77693->irq_domain, i);
+			if (cur_irq)
+				handle_nested_irq(cur_irq);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+int max77693_irq_resume(struct max77693_dev *max77693)
+{
+	if (max77693->irq)
+		max77693_irq_thread(0, max77693);
+
+	return 0;
+}
+
+static int max77693_irq_domain_map(struct irq_domain *d, unsigned int irq,
+				irq_hw_number_t hw)
+{
+	struct max77693_dev *max77693 = d->host_data;
+
+	irq_set_chip_data(irq, max77693);
+	irq_set_chip_and_handler(irq, &max77693_irq_chip, handle_edge_irq);
+	irq_set_nested_thread(irq, 1);
+#ifdef CONFIG_ARM
+	set_irq_flags(irq, IRQF_VALID);
+#else
+	irq_set_noprobe(irq);
+#endif
+	return 0;
+}
+
+static struct irq_domain_ops max77693_irq_domain_ops = {
+	.map = max77693_irq_domain_map,
+};
+
+int max77693_irq_init(struct max77693_dev *max77693)
+{
+	struct irq_domain *domain;
+	int i;
+	int ret;
+
+	mutex_init(&max77693->irqlock);
+
+	/* Mask individual interrupt sources */
+	for (i = 0; i < MAX77693_IRQ_GROUP_NR; i++) {
+		struct regmap *map;
+		/* MUIC IRQ  0:MASK 1:NOT MASK */
+		/* Other IRQ 1:MASK 0:NOT MASK */
+		if (i >= MUIC_INT1 && i <= MUIC_INT3) {
+			max77693->irq_masks_cur[i] = 0x00;
+			max77693->irq_masks_cache[i] = 0x00;
+		} else {
+			max77693->irq_masks_cur[i] = 0xff;
+			max77693->irq_masks_cache[i] = 0xff;
+		}
+		map = max77693_get_regmap(max77693, i);
+
+		if (IS_ERR_OR_NULL(map))
+			continue;
+		if (max77693_mask_reg[i] == MAX77693_REG_INVALID)
+			continue;
+		if (i >= MUIC_INT1 && i <= MUIC_INT3)
+			max77693_write_reg(map, max77693_mask_reg[i], 0x00);
+		else
+			max77693_write_reg(map, max77693_mask_reg[i], 0xff);
+	}
+
+	domain = irq_domain_add_linear(NULL, MAX77693_IRQ_NR,
+					&max77693_irq_domain_ops, max77693);
+	if (!domain) {
+		dev_err(max77693->dev, "could not create irq domain\n");
+		return -ENODEV;
+	}
+	max77693->irq_domain = domain;
+
+	ret = request_threaded_irq(max77693->irq, NULL, max77693_irq_thread,
+				   IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+				   "max77693-irq", max77693);
+
+	if (ret)
+		dev_err(max77693->dev, "Failed to request IRQ %d: %d\n",
+			max77693->irq, ret);
+
+	return 0;
+}
+
+void max77693_irq_exit(struct max77693_dev *max77693)
+{
+	if (max77693->irq)
+		free_irq(max77693->irq, max77693);
+}
diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c
index c852515..e9e4278 100644
--- a/drivers/mfd/max77693.c
+++ b/drivers/mfd/max77693.c
@@ -154,6 +154,10 @@ static int max77693_i2c_probe(struct i2c_client *i2c,
 	max77693->haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC);
 	i2c_set_clientdata(max77693->haptic, max77693);
 
+	ret = max77693_irq_init(max77693);
+	if (ret < 0)
+		goto err_mfd;
+
 	pm_runtime_set_active(max77693->dev);
 
 	ret = mfd_add_devices(max77693->dev, -1, max77693_devs,
@@ -161,6 +165,8 @@ static int max77693_i2c_probe(struct i2c_client *i2c,
 	if (ret < 0)
 		goto err_mfd;
 
+	device_init_wakeup(max77693->dev, pdata->wakeup);
+
 	return ret;
 
 err_mfd:
@@ -189,10 +195,36 @@ static const struct i2c_device_id max77693_i2c_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, max77693_i2c_id);
 
+static int max77693_suspend(struct device *dev)
+{
+	struct i2c_client *i2c = container_of(dev, struct i2c_client, dev);
+	struct max77693_dev *max77693 = i2c_get_clientdata(i2c);
+
+	if (device_may_wakeup(dev))
+		irq_set_irq_wake(max77693->irq, 1);
+	return 0;
+}
+
+static int max77693_resume(struct device *dev)
+{
+	struct i2c_client *i2c = container_of(dev, struct i2c_client, dev);
+	struct max77693_dev *max77693 = i2c_get_clientdata(i2c);
+
+	if (device_may_wakeup(dev))
+		irq_set_irq_wake(max77693->irq, 0);
+	return max77693_irq_resume(max77693);
+}
+
+const struct dev_pm_ops max77693_pm = {
+	.suspend = max77693_suspend,
+	.resume = max77693_resume,
+};
+
 static struct i2c_driver max77693_i2c_driver = {
 	.driver = {
 		   .name = "max77693",
 		   .owner = THIS_MODULE,
+		   .pm = &max77693_pm,
 	},
 	.probe = max77693_i2c_probe,
 	.remove = max77693_i2c_remove,
diff --git a/include/linux/mfd/max77693.h b/include/linux/mfd/max77693.h
index 5020b86..1d28ae9 100644
--- a/include/linux/mfd/max77693.h
+++ b/include/linux/mfd/max77693.h
@@ -31,7 +31,6 @@
 #define __LINUX_MFD_MAX77693_H
 
 struct max77693_platform_data {
-	/* IRQ */
 	int wakeup;
 };
 #endif	/* __LINUX_MFD_MAX77693_H */
-- 
cgit v0.10.2


From 4492c4c3ff7bbb5fd400f021532643a3493f0723 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Sun, 13 May 2012 21:53:45 +0100
Subject: mfd: Don't try to flag IRQ 0 as a wm831x wake source

If we've not got a primary IRQ we shouldn't be trying to flag IRQ 0 as a
wake source.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/wm831x-irq.c b/drivers/mfd/wm831x-irq.c
index bec4d05..2be9628 100644
--- a/drivers/mfd/wm831x-irq.c
+++ b/drivers/mfd/wm831x-irq.c
@@ -565,17 +565,6 @@ int wm831x_irq_init(struct wm831x *wm831x, int irq)
 	wm831x_set_bits(wm831x, WM831X_IRQ_CONFIG,
 			WM831X_IRQ_OD, i);
 
-	/* Try to flag /IRQ as a wake source; there are a number of
-	 * unconditional wake sources in the PMIC so this isn't
-	 * conditional but we don't actually care *too* much if it
-	 * fails.
-	 */
-	ret = enable_irq_wake(irq);
-	if (ret != 0) {
-		dev_warn(wm831x->dev, "Can't enable IRQ as wake source: %d\n",
-			 ret);
-	}
-
 	wm831x->irq = irq;
 
 	/* Register them with genirq */
@@ -597,6 +586,18 @@ int wm831x_irq_init(struct wm831x *wm831x, int irq)
 	}
 
 	if (irq) {
+		/* Try to flag /IRQ as a wake source; there are a number of
+		 * unconditional wake sources in the PMIC so this isn't
+		 * conditional but we don't actually care *too* much if it
+		 * fails.
+		 */
+		ret = enable_irq_wake(irq);
+		if (ret != 0) {
+			dev_warn(wm831x->dev,
+				 "Can't enable IRQ as wake source: %d\n",
+				 ret);
+		}
+
 		ret = request_threaded_irq(irq, NULL, wm831x_irq_thread,
 					   IRQF_TRIGGER_LOW | IRQF_ONESHOT,
 					   "wm831x", wm831x);
-- 
cgit v0.10.2


From cd99758ba3bde64347a8ece381cbae2fb5c745b2 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 14 May 2012 23:14:24 +0200
Subject: mfd: Convert wm831x to irq_domain

The modern idiom is to use irq_domain to allocate interrupts. This is
useful partly to allow further infrastructure to be based on the domains
and partly because it makes it much easier to allocate virtual interrupts
to devices as we don't need to allocate a contiguous range of interrupt
numbers.

Convert the wm831x driver over to this infrastructure, using a legacy
IRQ mapping if an irq_base is specified in platform data and otherwise
using a linear mapping, always registering the interrupts even if they
won't ever be used. Only boards which need to use the GPIOs as
interrupts should need to use an irq_base.

This means that we can't use the MFD irq_base management since the
unless we're using an explicit irq_base from platform data we can't rely
on a linear mapping of interrupts.  Instead we need to map things via
the irq_domain - provide a conveniencem function wm831x_irq() to save a
small amount of typing when doing so. Looking at this I couldn't clearly
see anything the MFD core could do to make this nicer.

Since we're not supporting device tree yet there's no meaningful
advantage if we don't do this conversion in one, the fact that the
interrupt resources are used for repeated IP blocks makes accessor
functions for the irq_domain more trouble to do than they're worth.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/gpio/gpio-wm831x.c b/drivers/gpio/gpio-wm831x.c
index deb949e..e56a216 100644
--- a/drivers/gpio/gpio-wm831x.c
+++ b/drivers/gpio/gpio-wm831x.c
@@ -102,10 +102,8 @@ static int wm831x_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
 	struct wm831x_gpio *wm831x_gpio = to_wm831x_gpio(chip);
 	struct wm831x *wm831x = wm831x_gpio->wm831x;
 
-	if (!wm831x->irq_base)
-		return -EINVAL;
-
-	return wm831x->irq_base + WM831X_IRQ_GPIO_1 + offset;
+	return irq_create_mapping(wm831x->irq_domain,
+				  WM831X_IRQ_GPIO_1 + offset);
 }
 
 static int wm831x_gpio_set_debounce(struct gpio_chip *chip, unsigned offset,
diff --git a/drivers/input/misc/wm831x-on.c b/drivers/input/misc/wm831x-on.c
index 47f18d6..6790a81 100644
--- a/drivers/input/misc/wm831x-on.c
+++ b/drivers/input/misc/wm831x-on.c
@@ -73,7 +73,7 @@ static int __devinit wm831x_on_probe(struct platform_device *pdev)
 {
 	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
 	struct wm831x_on *wm831x_on;
-	int irq = platform_get_irq(pdev, 0);
+	int irq = wm831x_irq(wm831x, platform_get_irq(pdev, 0));
 	int ret;
 
 	wm831x_on = kzalloc(sizeof(struct wm831x_on), GFP_KERNEL);
diff --git a/drivers/input/touchscreen/wm831x-ts.c b/drivers/input/touchscreen/wm831x-ts.c
index 4bc851a..e834107 100644
--- a/drivers/input/touchscreen/wm831x-ts.c
+++ b/drivers/input/touchscreen/wm831x-ts.c
@@ -260,15 +260,16 @@ static __devinit int wm831x_ts_probe(struct platform_device *pdev)
 	 * If we have a direct IRQ use it, otherwise use the interrupt
 	 * from the WM831x IRQ controller.
 	 */
+	wm831x_ts->data_irq = wm831x_irq(wm831x,
+					 platform_get_irq_byname(pdev,
+								 "TCHDATA"));
 	if (pdata && pdata->data_irq)
 		wm831x_ts->data_irq = pdata->data_irq;
-	else
-		wm831x_ts->data_irq = platform_get_irq_byname(pdev, "TCHDATA");
 
+	wm831x_ts->pd_irq = wm831x_irq(wm831x,
+				       platform_get_irq_byname(pdev, "TCHPD"));
 	if (pdata && pdata->pd_irq)
 		wm831x_ts->pd_irq = pdata->pd_irq;
-	else
-		wm831x_ts->pd_irq = platform_get_irq_byname(pdev, "TCHPD");
 
 	if (pdata)
 		wm831x_ts->pressure = pdata->pressure;
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index a0e1b83..8325c44 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -496,6 +496,7 @@ config MFD_WM831X_I2C
 	select MFD_CORE
 	select MFD_WM831X
 	select REGMAP_I2C
+	select IRQ_DOMAIN
 	depends on I2C=y && GENERIC_HARDIRQS
 	help
 	  Support for the Wolfson Microelecronics WM831x and WM832x PMICs
@@ -508,6 +509,7 @@ config MFD_WM831X_SPI
 	select MFD_CORE
 	select MFD_WM831X
 	select REGMAP_SPI
+	select IRQ_DOMAIN
 	depends on SPI_MASTER && GENERIC_HARDIRQS
 	help
 	  Support for the Wolfson Microelecronics WM831x and WM832x PMICs
diff --git a/drivers/mfd/wm831x-auxadc.c b/drivers/mfd/wm831x-auxadc.c
index 8721095..6ee3018 100644
--- a/drivers/mfd/wm831x-auxadc.c
+++ b/drivers/mfd/wm831x-auxadc.c
@@ -280,11 +280,11 @@ void wm831x_auxadc_init(struct wm831x *wm831x)
 	mutex_init(&wm831x->auxadc_lock);
 	INIT_LIST_HEAD(&wm831x->auxadc_pending);
 
-	if (wm831x->irq && wm831x->irq_base) {
+	if (wm831x->irq) {
 		wm831x->auxadc_read = wm831x_auxadc_read_irq;
 
-		ret = request_threaded_irq(wm831x->irq_base +
-					   WM831X_IRQ_AUXADC_DATA,
+		ret = request_threaded_irq(wm831x_irq(wm831x,
+						      WM831X_IRQ_AUXADC_DATA),
 					   NULL, wm831x_auxadc_irq, 0,
 					   "auxadc", wm831x);
 		if (ret < 0) {
diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c
index 476e4d3..946698f 100644
--- a/drivers/mfd/wm831x-core.c
+++ b/drivers/mfd/wm831x-core.c
@@ -1813,27 +1813,27 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 	case WM8310:
 		ret = mfd_add_devices(wm831x->dev, wm831x_num,
 				      wm8310_devs, ARRAY_SIZE(wm8310_devs),
-				      NULL, wm831x->irq_base);
+				      NULL, 0);
 		break;
 
 	case WM8311:
 		ret = mfd_add_devices(wm831x->dev, wm831x_num,
 				      wm8311_devs, ARRAY_SIZE(wm8311_devs),
-				      NULL, wm831x->irq_base);
+				      NULL, 0);
 		if (!pdata || !pdata->disable_touch)
 			mfd_add_devices(wm831x->dev, wm831x_num,
 					touch_devs, ARRAY_SIZE(touch_devs),
-					NULL, wm831x->irq_base);
+					NULL, 0);
 		break;
 
 	case WM8312:
 		ret = mfd_add_devices(wm831x->dev, wm831x_num,
 				      wm8312_devs, ARRAY_SIZE(wm8312_devs),
-				      NULL, wm831x->irq_base);
+				      NULL, 0);
 		if (!pdata || !pdata->disable_touch)
 			mfd_add_devices(wm831x->dev, wm831x_num,
 					touch_devs, ARRAY_SIZE(touch_devs),
-					NULL, wm831x->irq_base);
+					NULL, 0);
 		break;
 
 	case WM8320:
@@ -1842,7 +1842,7 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 	case WM8326:
 		ret = mfd_add_devices(wm831x->dev, wm831x_num,
 				      wm8320_devs, ARRAY_SIZE(wm8320_devs),
-				      NULL, wm831x->irq_base);
+				      NULL, 0);
 		break;
 
 	default:
@@ -1867,7 +1867,7 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 	if (ret & WM831X_XTAL_ENA) {
 		ret = mfd_add_devices(wm831x->dev, wm831x_num,
 				      rtc_devs, ARRAY_SIZE(rtc_devs),
-				      NULL, wm831x->irq_base);
+				      NULL, 0);
 		if (ret != 0) {
 			dev_err(wm831x->dev, "Failed to add RTC: %d\n", ret);
 			goto err_irq;
@@ -1880,7 +1880,7 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 		/* Treat errors as non-critical */
 		ret = mfd_add_devices(wm831x->dev, wm831x_num, backlight_devs,
 				      ARRAY_SIZE(backlight_devs), NULL,
-				      wm831x->irq_base);
+				      0);
 		if (ret < 0)
 			dev_err(wm831x->dev, "Failed to add backlight: %d\n",
 				ret);
@@ -1909,8 +1909,7 @@ void wm831x_device_exit(struct wm831x *wm831x)
 {
 	wm831x_otp_exit(wm831x);
 	mfd_remove_devices(wm831x->dev);
-	if (wm831x->irq_base)
-		free_irq(wm831x->irq_base + WM831X_IRQ_AUXADC_DATA, wm831x);
+	free_irq(wm831x_irq(wm831x, WM831X_IRQ_AUXADC_DATA), wm831x);
 	wm831x_irq_exit(wm831x);
 }
 
diff --git a/drivers/mfd/wm831x-irq.c b/drivers/mfd/wm831x-irq.c
index 2be9628..ecc9d6d 100644
--- a/drivers/mfd/wm831x-irq.c
+++ b/drivers/mfd/wm831x-irq.c
@@ -18,6 +18,7 @@
 #include <linux/irq.h>
 #include <linux/mfd/core.h>
 #include <linux/interrupt.h>
+#include <linux/irqdomain.h>
 
 #include <linux/mfd/wm831x/core.h>
 #include <linux/mfd/wm831x/pdata.h>
@@ -328,7 +329,7 @@ static inline int irq_data_to_status_reg(struct wm831x_irq_data *irq_data)
 static inline struct wm831x_irq_data *irq_to_wm831x_irq(struct wm831x *wm831x,
 							int irq)
 {
-	return &wm831x_irqs[irq - wm831x->irq_base];
+	return &wm831x_irqs[irq];
 }
 
 static void wm831x_irq_lock(struct irq_data *data)
@@ -374,7 +375,7 @@ static void wm831x_irq_enable(struct irq_data *data)
 {
 	struct wm831x *wm831x = irq_data_get_irq_chip_data(data);
 	struct wm831x_irq_data *irq_data = irq_to_wm831x_irq(wm831x,
-							     data->irq);
+							     data->hwirq);
 
 	wm831x->irq_masks_cur[irq_data->reg - 1] &= ~irq_data->mask;
 }
@@ -383,7 +384,7 @@ static void wm831x_irq_disable(struct irq_data *data)
 {
 	struct wm831x *wm831x = irq_data_get_irq_chip_data(data);
 	struct wm831x_irq_data *irq_data = irq_to_wm831x_irq(wm831x,
-							     data->irq);
+							     data->hwirq);
 
 	wm831x->irq_masks_cur[irq_data->reg - 1] |= irq_data->mask;
 }
@@ -393,7 +394,7 @@ static int wm831x_irq_set_type(struct irq_data *data, unsigned int type)
 	struct wm831x *wm831x = irq_data_get_irq_chip_data(data);
 	int irq;
 
-	irq = data->irq - wm831x->irq_base;
+	irq = data->hwirq;
 
 	if (irq < WM831X_IRQ_GPIO_1 || irq > WM831X_IRQ_GPIO_11) {
 		/* Ignore internal-only IRQs */
@@ -469,9 +470,11 @@ static irqreturn_t wm831x_irq_thread(int irq, void *data)
 	 * descriptors.
 	 */
 	if (primary & WM831X_TCHPD_INT)
-		handle_nested_irq(wm831x->irq_base + WM831X_IRQ_TCHPD);
+		handle_nested_irq(irq_find_mapping(wm831x->irq_domain,
+						   WM831X_IRQ_TCHPD));
 	if (primary & WM831X_TCHDATA_INT)
-		handle_nested_irq(wm831x->irq_base + WM831X_IRQ_TCHDATA);
+		handle_nested_irq(irq_find_mapping(wm831x->irq_domain,
+						   WM831X_IRQ_TCHDATA));
 	primary &= ~(WM831X_TCHDATA_EINT | WM831X_TCHPD_EINT);
 
 	for (i = 0; i < ARRAY_SIZE(wm831x_irqs); i++) {
@@ -507,7 +510,8 @@ static irqreturn_t wm831x_irq_thread(int irq, void *data)
 		}
 
 		if (*status & wm831x_irqs[i].mask)
-			handle_nested_irq(wm831x->irq_base + i);
+			handle_nested_irq(irq_find_mapping(wm831x->irq_domain,
+							   i));
 
 		/* Simulate an edge triggered IRQ by polling the input
 		 * status.  This is sucky but improves interoperability.
@@ -516,7 +520,8 @@ static irqreturn_t wm831x_irq_thread(int irq, void *data)
 		    wm831x->gpio_level[i - WM831X_IRQ_GPIO_1]) {
 			ret = wm831x_reg_read(wm831x, WM831X_GPIO_LEVEL);
 			while (ret & 1 << (i - WM831X_IRQ_GPIO_1)) {
-				handle_nested_irq(wm831x->irq_base + i);
+				handle_nested_irq(irq_find_mapping(wm831x->irq_domain,
+								   i));
 				ret = wm831x_reg_read(wm831x,
 						      WM831X_GPIO_LEVEL);
 			}
@@ -527,10 +532,34 @@ out:
 	return IRQ_HANDLED;
 }
 
+static int wm831x_irq_map(struct irq_domain *h, unsigned int virq,
+			  irq_hw_number_t hw)
+{
+	irq_set_chip_data(virq, h->host_data);
+	irq_set_chip_and_handler(virq, &wm831x_irq_chip, handle_edge_irq);
+	irq_set_nested_thread(virq, 1);
+
+	/* ARM needs us to explicitly flag the IRQ as valid
+	 * and will set them noprobe when we do so. */
+#ifdef CONFIG_ARM
+	set_irq_flags(virq, IRQF_VALID);
+#else
+	irq_set_noprobe(virq);
+#endif
+
+	return 0;
+}
+
+static struct irq_domain_ops wm831x_irq_domain_ops = {
+	.map	= wm831x_irq_map,
+	.xlate	= irq_domain_xlate_twocell,
+};
+
 int wm831x_irq_init(struct wm831x *wm831x, int irq)
 {
 	struct wm831x_pdata *pdata = wm831x->dev->platform_data;
-	int i, cur_irq, ret;
+	struct irq_domain *domain;
+	int i, ret, irq_base;
 
 	mutex_init(&wm831x->irq_lock);
 
@@ -543,18 +572,33 @@ int wm831x_irq_init(struct wm831x *wm831x, int irq)
 	}
 
 	/* Try to dynamically allocate IRQs if no base is specified */
-	if (!pdata || !pdata->irq_base)
-		wm831x->irq_base = -1;
+	if (pdata && pdata->irq_base) {
+		irq_base = irq_alloc_descs(pdata->irq_base, 0,
+					   WM831X_NUM_IRQS, 0);
+		if (irq_base < 0) {
+			dev_warn(wm831x->dev, "Failed to allocate IRQs: %d\n",
+				 irq_base);
+			irq_base = 0;
+		}
+	} else {
+		irq_base = 0;
+	}
+
+	if (irq_base)
+		domain = irq_domain_add_legacy(wm831x->dev->of_node,
+					       ARRAY_SIZE(wm831x_irqs),
+					       irq_base, 0,
+					       &wm831x_irq_domain_ops,
+					       wm831x);
 	else
-		wm831x->irq_base = pdata->irq_base;
+		domain = irq_domain_add_linear(wm831x->dev->of_node,
+					       ARRAY_SIZE(wm831x_irqs),
+					       &wm831x_irq_domain_ops,
+					       wm831x);
 
-	wm831x->irq_base = irq_alloc_descs(wm831x->irq_base, 0,
-					   WM831X_NUM_IRQS, 0);
-	if (wm831x->irq_base < 0) {
-		dev_warn(wm831x->dev, "Failed to allocate IRQs: %d\n",
-			 wm831x->irq_base);
-		wm831x->irq_base = 0;
-		return 0;
+	if (!domain) {
+		dev_warn(wm831x->dev, "Failed to allocate IRQ domain\n");
+		return -EINVAL;
 	}
 
 	if (pdata && pdata->irq_cmos)
@@ -566,24 +610,7 @@ int wm831x_irq_init(struct wm831x *wm831x, int irq)
 			WM831X_IRQ_OD, i);
 
 	wm831x->irq = irq;
-
-	/* Register them with genirq */
-	for (cur_irq = wm831x->irq_base;
-	     cur_irq < ARRAY_SIZE(wm831x_irqs) + wm831x->irq_base;
-	     cur_irq++) {
-		irq_set_chip_data(cur_irq, wm831x);
-		irq_set_chip_and_handler(cur_irq, &wm831x_irq_chip,
-					 handle_edge_irq);
-		irq_set_nested_thread(cur_irq, 1);
-
-		/* ARM needs us to explicitly flag the IRQ as valid
-		 * and will set them noprobe when we do so. */
-#ifdef CONFIG_ARM
-		set_irq_flags(cur_irq, IRQF_VALID);
-#else
-		irq_set_noprobe(cur_irq);
-#endif
-	}
+	wm831x->irq_domain = domain;
 
 	if (irq) {
 		/* Try to flag /IRQ as a wake source; there are a number of
diff --git a/drivers/power/wm831x_power.c b/drivers/power/wm831x_power.c
index 987332b..fc1ad95 100644
--- a/drivers/power/wm831x_power.c
+++ b/drivers/power/wm831x_power.c
@@ -565,7 +565,7 @@ static __devinit int wm831x_power_probe(struct platform_device *pdev)
 			    goto err_usb;
 	}
 
-	irq = platform_get_irq_byname(pdev, "SYSLO");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "SYSLO"));
 	ret = request_threaded_irq(irq, NULL, wm831x_syslo_irq,
 				   IRQF_TRIGGER_RISING, "System power low",
 				   power);
@@ -575,7 +575,7 @@ static __devinit int wm831x_power_probe(struct platform_device *pdev)
 		goto err_battery;
 	}
 
-	irq = platform_get_irq_byname(pdev, "PWR SRC");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "PWR SRC"));
 	ret = request_threaded_irq(irq, NULL, wm831x_pwr_src_irq,
 				   IRQF_TRIGGER_RISING, "Power source",
 				   power);
@@ -586,7 +586,9 @@ static __devinit int wm831x_power_probe(struct platform_device *pdev)
 	}
 
 	for (i = 0; i < ARRAY_SIZE(wm831x_bat_irqs); i++) {
-		irq = platform_get_irq_byname(pdev, wm831x_bat_irqs[i]);
+		irq = wm831x_irq(wm831x,
+				 platform_get_irq_byname(pdev,
+							 wm831x_bat_irqs[i]));
 		ret = request_threaded_irq(irq, NULL, wm831x_bat_irq,
 					   IRQF_TRIGGER_RISING,
 					   wm831x_bat_irqs[i],
@@ -606,10 +608,10 @@ err_bat_irq:
 		irq = platform_get_irq_byname(pdev, wm831x_bat_irqs[i]);
 		free_irq(irq, power);
 	}
-	irq = platform_get_irq_byname(pdev, "PWR SRC");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "PWR SRC"));
 	free_irq(irq, power);
 err_syslo:
-	irq = platform_get_irq_byname(pdev, "SYSLO");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "SYSLO"));
 	free_irq(irq, power);
 err_battery:
 	if (power->have_battery)
@@ -626,17 +628,20 @@ err_kmalloc:
 static __devexit int wm831x_power_remove(struct platform_device *pdev)
 {
 	struct wm831x_power *wm831x_power = platform_get_drvdata(pdev);
+	struct wm831x *wm831x = wm831x_power->wm831x;
 	int irq, i;
 
 	for (i = 0; i < ARRAY_SIZE(wm831x_bat_irqs); i++) {
-		irq = platform_get_irq_byname(pdev, wm831x_bat_irqs[i]);
+		irq = wm831x_irq(wm831x, 
+				 platform_get_irq_byname(pdev,
+							 wm831x_bat_irqs[i]));
 		free_irq(irq, wm831x_power);
 	}
 
-	irq = platform_get_irq_byname(pdev, "PWR SRC");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "PWR SRC"));
 	free_irq(irq, wm831x_power);
 
-	irq = platform_get_irq_byname(pdev, "SYSLO");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "SYSLO"));
 	free_irq(irq, wm831x_power);
 
 	if (wm831x_power->have_battery)
diff --git a/drivers/regulator/wm831x-dcdc.c b/drivers/regulator/wm831x-dcdc.c
index ff810e7..33b2f20 100644
--- a/drivers/regulator/wm831x-dcdc.c
+++ b/drivers/regulator/wm831x-dcdc.c
@@ -565,7 +565,7 @@ static __devinit int wm831x_buckv_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	irq = platform_get_irq_byname(pdev, "UV");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV"));
 	ret = request_threaded_irq(irq, NULL, wm831x_dcdc_uv_irq,
 				   IRQF_TRIGGER_RISING, dcdc->name, dcdc);
 	if (ret != 0) {
@@ -574,7 +574,7 @@ static __devinit int wm831x_buckv_probe(struct platform_device *pdev)
 		goto err_regulator;
 	}
 
-	irq = platform_get_irq_byname(pdev, "HC");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "HC"));
 	ret = request_threaded_irq(irq, NULL, wm831x_dcdc_oc_irq,
 				   IRQF_TRIGGER_RISING, dcdc->name, dcdc);
 	if (ret != 0) {
@@ -588,7 +588,8 @@ static __devinit int wm831x_buckv_probe(struct platform_device *pdev)
 	return 0;
 
 err_uv:
-	free_irq(platform_get_irq_byname(pdev, "UV"), dcdc);
+	free_irq(wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV")),
+		 dcdc);
 err_regulator:
 	regulator_unregister(dcdc->regulator);
 err:
@@ -600,11 +601,14 @@ err:
 static __devexit int wm831x_buckv_remove(struct platform_device *pdev)
 {
 	struct wm831x_dcdc *dcdc = platform_get_drvdata(pdev);
+	struct wm831x *wm831x = dcdc->wm831x;
 
 	platform_set_drvdata(pdev, NULL);
 
-	free_irq(platform_get_irq_byname(pdev, "HC"), dcdc);
-	free_irq(platform_get_irq_byname(pdev, "UV"), dcdc);
+	free_irq(wm831x_irq(wm831x, platform_get_irq_byname(pdev, "HC")),
+			    dcdc);
+	free_irq(wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV")),
+			    dcdc);
 	regulator_unregister(dcdc->regulator);
 	if (dcdc->dvs_gpio)
 		gpio_free(dcdc->dvs_gpio);
@@ -758,7 +762,7 @@ static __devinit int wm831x_buckp_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	irq = platform_get_irq_byname(pdev, "UV");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV"));
 	ret = request_threaded_irq(irq, NULL, wm831x_dcdc_uv_irq,
 				   IRQF_TRIGGER_RISING,	dcdc->name, dcdc);
 	if (ret != 0) {
@@ -783,7 +787,8 @@ static __devexit int wm831x_buckp_remove(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, NULL);
 
-	free_irq(platform_get_irq_byname(pdev, "UV"), dcdc);
+	free_irq(wm831x_irq(dcdc->wm831x, platform_get_irq_byname(pdev, "UV")),
+			    dcdc);
 	regulator_unregister(dcdc->regulator);
 
 	return 0;
@@ -883,7 +888,7 @@ static __devinit int wm831x_boostp_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	irq = platform_get_irq_byname(pdev, "UV");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV"));
 	ret = request_threaded_irq(irq, NULL, wm831x_dcdc_uv_irq,
 				   IRQF_TRIGGER_RISING, dcdc->name,
 				   dcdc);
@@ -910,7 +915,8 @@ static __devexit int wm831x_boostp_remove(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, NULL);
 
-	free_irq(platform_get_irq_byname(pdev, "UV"), dcdc);
+	free_irq(wm831x_irq(dcdc->wm831x, platform_get_irq_byname(pdev, "UV")),
+		 dcdc);
 	regulator_unregister(dcdc->regulator);
 	kfree(dcdc);
 
diff --git a/drivers/regulator/wm831x-isink.c b/drivers/regulator/wm831x-isink.c
index b414e09..1596947 100644
--- a/drivers/regulator/wm831x-isink.c
+++ b/drivers/regulator/wm831x-isink.c
@@ -198,7 +198,7 @@ static __devinit int wm831x_isink_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	irq = platform_get_irq(pdev, 0);
+	irq = wm831x_irq(wm831x, platform_get_irq(pdev, 0));
 	ret = request_threaded_irq(irq, NULL, wm831x_isink_irq,
 				   IRQF_TRIGGER_RISING, isink->name, isink);
 	if (ret != 0) {
@@ -223,7 +223,7 @@ static __devexit int wm831x_isink_remove(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, NULL);
 
-	free_irq(platform_get_irq(pdev, 0), isink);
+	free_irq(wm831x_irq(isink->wm831x, platform_get_irq(pdev, 0)), isink);
 
 	regulator_unregister(isink->regulator);
 
diff --git a/drivers/regulator/wm831x-ldo.c b/drivers/regulator/wm831x-ldo.c
index 641e9f6..b09ba05 100644
--- a/drivers/regulator/wm831x-ldo.c
+++ b/drivers/regulator/wm831x-ldo.c
@@ -359,7 +359,7 @@ static __devinit int wm831x_gp_ldo_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	irq = platform_get_irq_byname(pdev, "UV");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV"));
 	ret = request_threaded_irq(irq, NULL, wm831x_ldo_uv_irq,
 				   IRQF_TRIGGER_RISING, ldo->name,
 				   ldo);
@@ -385,7 +385,8 @@ static __devexit int wm831x_gp_ldo_remove(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, NULL);
 
-	free_irq(platform_get_irq_byname(pdev, "UV"), ldo);
+	free_irq(wm831x_irq(ldo->wm831x,
+			    platform_get_irq_byname(pdev, "UV")), ldo);
 	regulator_unregister(ldo->regulator);
 
 	return 0;
@@ -624,7 +625,7 @@ static __devinit int wm831x_aldo_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	irq = platform_get_irq_byname(pdev, "UV");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV"));
 	ret = request_threaded_irq(irq, NULL, wm831x_ldo_uv_irq,
 				   IRQF_TRIGGER_RISING, ldo->name, ldo);
 	if (ret != 0) {
@@ -647,7 +648,8 @@ static __devexit int wm831x_aldo_remove(struct platform_device *pdev)
 {
 	struct wm831x_ldo *ldo = platform_get_drvdata(pdev);
 
-	free_irq(platform_get_irq_byname(pdev, "UV"), ldo);
+	free_irq(wm831x_irq(ldo->wm831x, platform_get_irq_byname(pdev, "UV")),
+		 ldo);
 	regulator_unregister(ldo->regulator);
 
 	return 0;
diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c
index 3b6e6a6..59c6245 100644
--- a/drivers/rtc/rtc-wm831x.c
+++ b/drivers/rtc/rtc-wm831x.c
@@ -396,7 +396,7 @@ static int wm831x_rtc_probe(struct platform_device *pdev)
 {
 	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
 	struct wm831x_rtc *wm831x_rtc;
-	int alm_irq = platform_get_irq_byname(pdev, "ALM");
+	int alm_irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "ALM"));
 	int ret = 0;
 
 	wm831x_rtc = devm_kzalloc(&pdev->dev, sizeof(*wm831x_rtc), GFP_KERNEL);
diff --git a/include/linux/mfd/wm831x/core.h b/include/linux/mfd/wm831x/core.h
index 4b12118..736191c 100644
--- a/include/linux/mfd/wm831x/core.h
+++ b/include/linux/mfd/wm831x/core.h
@@ -17,6 +17,7 @@
 
 #include <linux/completion.h>
 #include <linux/interrupt.h>
+#include <linux/irqdomain.h>
 #include <linux/list.h>
 #include <linux/regmap.h>
 
@@ -338,6 +339,7 @@
 #define WM831X_FLL_CLK_SRC_WIDTH                     2  /* FLL_CLK_SRC - [1:0] */
 
 struct regulator_dev;
+struct irq_domain;
 
 #define WM831X_NUM_IRQ_REGS 5
 #define WM831X_NUM_GPIO_REGS 16
@@ -367,7 +369,7 @@ struct wm831x {
 
 	int irq;  /* Our chip IRQ */
 	struct mutex irq_lock;
-	int irq_base;
+	struct irq_domain *irq_domain;
 	int irq_masks_cur[WM831X_NUM_IRQ_REGS];   /* Currently active value */
 	int irq_masks_cache[WM831X_NUM_IRQ_REGS]; /* Cached hardware value */
 
@@ -417,6 +419,11 @@ int wm831x_irq_init(struct wm831x *wm831x, int irq);
 void wm831x_irq_exit(struct wm831x *wm831x);
 void wm831x_auxadc_init(struct wm831x *wm831x);
 
+static inline int wm831x_irq(struct wm831x *wm831x, int irq)
+{
+	return irq_create_mapping(wm831x->irq_domain, irq);
+}
+
 extern struct regmap_config wm831x_regmap_config;
 
 #endif
-- 
cgit v0.10.2


From 08b4c118af35d4d67eca2052aaa8d4a5f7d0aecb Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Wed, 16 May 2012 09:05:54 +0100
Subject: mfd: wm8400 needs to depend on I2C=y

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 8325c44..6da82de 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -479,7 +479,7 @@ config MFD_S5M_CORE
 config MFD_WM8400
 	bool "Support Wolfson Microelectronics WM8400"
 	select MFD_CORE
-	depends on I2C
+	depends on I2C=y
 	select REGMAP_I2C
 	help
 	  Support for the Wolfson Microelecronics WM8400 PMIC and audio
-- 
cgit v0.10.2


From b09530ef844f0bf29ed3677080c02b179be84818 Mon Sep 17 00:00:00 2001
From: Richard Zhao <richard.zhao@linaro.org>
Date: Sun, 13 May 2012 09:18:02 +0800
Subject: mfd: Make anatop register accessor more flexible and rename
 meaningfully

 - rename to anatop_read_reg and anatop_write_reg
 - anatop_read_reg directly return reg value
 - anatop_write_reg write reg with mask

Signed-off-by: Richard Zhao <richard.zhao@freescale.com>
Reviewed-by: Ying-Chun Liu (PaulLiu) <paul.liu@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/anatop-mfd.c b/drivers/mfd/anatop-mfd.c
index 2af4248..6da0634 100644
--- a/drivers/mfd/anatop-mfd.c
+++ b/drivers/mfd/anatop-mfd.c
@@ -41,39 +41,26 @@
 #include <linux/of_address.h>
 #include <linux/mfd/anatop.h>
 
-u32 anatop_get_bits(struct anatop *adata, u32 addr, int bit_shift,
-		    int bit_width)
+u32 anatop_read_reg(struct anatop *adata, u32 addr)
 {
-	u32 val, mask;
-
-	if (bit_width == 32)
-		mask = ~0;
-	else
-		mask = (1 << bit_width) - 1;
-
-	val = readl(adata->ioreg + addr);
-	val = (val >> bit_shift) & mask;
-
-	return val;
+	return readl(adata->ioreg + addr);
 }
-EXPORT_SYMBOL_GPL(anatop_get_bits);
+EXPORT_SYMBOL_GPL(anatop_read_reg);
 
-void anatop_set_bits(struct anatop *adata, u32 addr, int bit_shift,
-		     int bit_width, u32 data)
+void anatop_write_reg(struct anatop *adata, u32 addr, u32 data, u32 mask)
 {
-	u32 val, mask;
+	u32 val;
 
-	if (bit_width == 32)
-		mask = ~0;
-	else
-		mask = (1 << bit_width) - 1;
+	data &= mask;
 
 	spin_lock(&adata->reglock);
-	val = readl(adata->ioreg + addr) & ~(mask << bit_shift);
-	writel((data << bit_shift) | val, adata->ioreg + addr);
+	val = readl(adata->ioreg + addr);
+	val &= ~mask;
+	val |= data;
+	writel(val, adata->ioreg + addr);
 	spin_unlock(&adata->reglock);
 }
-EXPORT_SYMBOL_GPL(anatop_set_bits);
+EXPORT_SYMBOL_GPL(anatop_write_reg);
 
 static const struct of_device_id of_anatop_match[] = {
 	{ .compatible = "fsl,imx6q-anatop", },
diff --git a/drivers/regulator/anatop-regulator.c b/drivers/regulator/anatop-regulator.c
index 81fd606..0a34085 100644
--- a/drivers/regulator/anatop-regulator.c
+++ b/drivers/regulator/anatop-regulator.c
@@ -47,7 +47,7 @@ static int anatop_set_voltage(struct regulator_dev *reg, int min_uV,
 				  int max_uV, unsigned *selector)
 {
 	struct anatop_regulator *anatop_reg = rdev_get_drvdata(reg);
-	u32 val, sel;
+	u32 val, sel, mask;
 	int uv;
 
 	uv = min_uV;
@@ -71,11 +71,10 @@ static int anatop_set_voltage(struct regulator_dev *reg, int min_uV,
 	val = anatop_reg->min_bit_val + sel;
 	*selector = sel;
 	dev_dbg(&reg->dev, "%s: calculated val %d\n", __func__, val);
-	anatop_set_bits(anatop_reg->mfd,
-			anatop_reg->control_reg,
-			anatop_reg->vol_bit_shift,
-			anatop_reg->vol_bit_width,
-			val);
+	mask = ((1 << anatop_reg->vol_bit_width) - 1) <<
+		anatop_reg->vol_bit_shift;
+	val <<= anatop_reg->vol_bit_shift;
+	anatop_write_reg(anatop_reg->mfd, anatop_reg->control_reg, val, mask);
 
 	return 0;
 }
@@ -88,10 +87,9 @@ static int anatop_get_voltage_sel(struct regulator_dev *reg)
 	if (!anatop_reg->control_reg)
 		return -ENOTSUPP;
 
-	val = anatop_get_bits(anatop_reg->mfd,
-			      anatop_reg->control_reg,
-			      anatop_reg->vol_bit_shift,
-			      anatop_reg->vol_bit_width);
+	val = anatop_read_reg(anatop_reg->mfd, anatop_reg->control_reg);
+	val = (val & ((1 << anatop_reg->vol_bit_width) - 1)) >>
+		anatop_reg->vol_bit_shift;
 
 	return val - anatop_reg->min_bit_val;
 }
diff --git a/include/linux/mfd/anatop.h b/include/linux/mfd/anatop.h
index 22c1007..7f92acf 100644
--- a/include/linux/mfd/anatop.h
+++ b/include/linux/mfd/anatop.h
@@ -34,7 +34,7 @@ struct anatop {
 	spinlock_t reglock;
 };
 
-extern u32 anatop_get_bits(struct anatop *, u32, int, int);
-extern void anatop_set_bits(struct anatop *, u32, int, int, u32);
+extern u32 anatop_read_reg(struct anatop *, u32);
+extern void anatop_write_reg(struct anatop *, u32, u32, u32);
 
 #endif /*  __LINUX_MFD_ANATOP_H */
-- 
cgit v0.10.2


From 21f7541d8861fdcdff663c68903e961ca1b06dc6 Mon Sep 17 00:00:00 2001
From: Rhyland Klein <rklein@nvidia.com>
Date: Fri, 18 May 2012 11:52:19 +0200
Subject: mfd: Add tps65910-irq devicetree init and irqdomain support

This change changes the tps65910-irq code to use irqdomain, and support
initialization from devicetree. This assumes that the irq_base in the
platform data is -1 if devicetree is used.

Signed-off-by: Rhyland Klein <rklein@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 6da82de..b819eea 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -190,6 +190,7 @@ config MFD_TPS65910
 	depends on I2C=y && GPIOLIB
 	select MFD_CORE
 	select REGMAP_I2C
+	select IRQ_DOMAIN
 	help
 	  if you say yes here you get support for the TPS65910 series of
 	  Power Management chips.
diff --git a/drivers/mfd/tps65910-irq.c b/drivers/mfd/tps65910-irq.c
index 0f1ff7f..09aab3e4 100644
--- a/drivers/mfd/tps65910-irq.c
+++ b/drivers/mfd/tps65910-irq.c
@@ -20,15 +20,10 @@
 #include <linux/device.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
+#include <linux/irqdomain.h>
 #include <linux/gpio.h>
 #include <linux/mfd/tps65910.h>
 
-static inline int irq_to_tps65910_irq(struct tps65910 *tps65910,
-							int irq)
-{
-	return (irq - tps65910->irq_base);
-}
-
 /*
  * This is a threaded IRQ handler so can access I2C/SPI.  Since all
  * interrupts are clear on read the IRQ line will be reasserted and
@@ -76,7 +71,7 @@ static irqreturn_t tps65910_irq(int irq, void *irq_data)
 		if (!(irq_sts & (1 << i)))
 			continue;
 
-		handle_nested_irq(tps65910->irq_base + i);
+		handle_nested_irq(irq_find_mapping(tps65910->domain, i));
 	}
 
 	/* Write the STS register back to clear IRQs we handled */
@@ -135,14 +130,14 @@ static void tps65910_irq_enable(struct irq_data *data)
 {
 	struct tps65910 *tps65910 = irq_data_get_irq_chip_data(data);
 
-	tps65910->irq_mask &= ~( 1 << irq_to_tps65910_irq(tps65910, data->irq));
+	tps65910->irq_mask &= ~(1 << data->hwirq);
 }
 
 static void tps65910_irq_disable(struct irq_data *data)
 {
 	struct tps65910 *tps65910 = irq_data_get_irq_chip_data(data);
 
-	tps65910->irq_mask |= ( 1 << irq_to_tps65910_irq(tps65910, data->irq));
+	tps65910->irq_mask |= (1 << data->hwirq);
 }
 
 #ifdef CONFIG_PM_SLEEP
@@ -164,10 +159,35 @@ static struct irq_chip tps65910_irq_chip = {
 	.irq_set_wake = tps65910_irq_set_wake,
 };
 
+static int tps65910_irq_map(struct irq_domain *h, unsigned int virq,
+				irq_hw_number_t hw)
+{
+	struct tps65910 *tps65910 = h->host_data;
+
+	irq_set_chip_data(virq, tps65910);
+	irq_set_chip_and_handler(virq, &tps65910_irq_chip, handle_edge_irq);
+	irq_set_nested_thread(virq, 1);
+
+	/* ARM needs us to explicitly flag the IRQ as valid
+	 * and will set them noprobe when we do so. */
+#ifdef CONFIG_ARM
+	set_irq_flags(virq, IRQF_VALID);
+#else
+	irq_set_noprobe(virq);
+#endif
+
+	return 0;
+}
+
+static struct irq_domain_ops tps65910_domain_ops = {
+	.map	= tps65910_irq_map,
+	.xlate	= irq_domain_xlate_twocell,
+};
+
 int tps65910_irq_init(struct tps65910 *tps65910, int irq,
 		    struct tps65910_platform_data *pdata)
 {
-	int ret, cur_irq;
+	int ret;
 	int flags = IRQF_ONESHOT;
 
 	if (!irq) {
@@ -175,17 +195,11 @@ int tps65910_irq_init(struct tps65910 *tps65910, int irq,
 		return -EINVAL;
 	}
 
-	if (!pdata || !pdata->irq_base) {
-		dev_warn(tps65910->dev, "No interrupt support, no IRQ base\n");
+	if (!pdata) {
+		dev_warn(tps65910->dev, "No interrupt support, no pdata\n");
 		return -EINVAL;
 	}
 
-	tps65910->irq_mask = 0xFFFFFF;
-
-	mutex_init(&tps65910->irq_lock);
-	tps65910->chip_irq = irq;
-	tps65910->irq_base = pdata->irq_base;
-
 	switch (tps65910_chip_id(tps65910)) {
 	case TPS65910:
 		tps65910->irq_num = TPS65910_NUM_IRQ;
@@ -195,22 +209,36 @@ int tps65910_irq_init(struct tps65910 *tps65910, int irq,
 		break;
 	}
 
-	/* Register with genirq */
-	for (cur_irq = tps65910->irq_base;
-	     cur_irq < tps65910->irq_num + tps65910->irq_base;
-	     cur_irq++) {
-		irq_set_chip_data(cur_irq, tps65910);
-		irq_set_chip_and_handler(cur_irq, &tps65910_irq_chip,
-					 handle_edge_irq);
-		irq_set_nested_thread(cur_irq, 1);
-
-		/* ARM needs us to explicitly flag the IRQ as valid
-		 * and will set them noprobe when we do so. */
-#ifdef CONFIG_ARM
-		set_irq_flags(cur_irq, IRQF_VALID);
-#else
-		irq_set_noprobe(cur_irq);
-#endif
+	if (pdata->irq_base > 0) {
+		pdata->irq_base = irq_alloc_descs(pdata->irq_base, 0,
+					tps65910->irq_num, -1);
+		if (pdata->irq_base < 0) {
+			dev_warn(tps65910->dev, "Failed to alloc IRQs: %d\n",
+					pdata->irq_base);
+			return pdata->irq_base;
+		}
+	}
+
+	tps65910->irq_mask = 0xFFFFFF;
+
+	mutex_init(&tps65910->irq_lock);
+	tps65910->chip_irq = irq;
+	tps65910->irq_base = pdata->irq_base;
+
+	if (pdata->irq_base > 0)
+		tps65910->domain = irq_domain_add_legacy(tps65910->dev->of_node,
+					tps65910->irq_num,
+					pdata->irq_base,
+					0,
+					&tps65910_domain_ops, tps65910);
+	else
+		tps65910->domain = irq_domain_add_linear(tps65910->dev->of_node,
+					tps65910->irq_num,
+					&tps65910_domain_ops, tps65910);
+
+	if (!tps65910->domain) {
+		dev_err(tps65910->dev, "Failed to create IRQ domain\n");
+		return -ENOMEM;
 	}
 
 	ret = request_threaded_irq(irq, NULL, tps65910_irq, flags,
diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h
index c2673ee..ab04e90 100644
--- a/include/linux/mfd/tps65910.h
+++ b/include/linux/mfd/tps65910.h
@@ -836,6 +836,7 @@ struct tps65910 {
 	int irq_base;
 	int irq_num;
 	u32 irq_mask;
+	struct irq_domain *domain;
 };
 
 struct tps65910_platform_data {
-- 
cgit v0.10.2


From 16e5e204c92800aad4e7db52d289565cc82240ce Mon Sep 17 00:00:00 2001
From: Ashish Jangam <ashish.jangam@kpitcummins.com>
Date: Fri, 18 May 2012 12:19:18 +0200
Subject: mfd: Add ADC support to the DA9052/53 core

This patch adds ADC support to the DA9052/53 core.

Tested on smdkv6410 and i.mx53 QS boards.

Signed-off-by: Ashish Jangam <ashish.jangam@kpitcummins.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/da9052-core.c b/drivers/mfd/da9052-core.c
index 7ff313f..5036cf5 100644
--- a/drivers/mfd/da9052-core.c
+++ b/drivers/mfd/da9052-core.c
@@ -318,6 +318,135 @@ static bool da9052_reg_volatile(struct device *dev, unsigned int reg)
 	}
 }
 
+/*
+ * TBAT look-up table is computed from the R90 reg (8 bit register)
+ * reading as below. The battery temperature is in milliCentigrade
+ * TBAT = (1/(t1+1/298) - 273) * 1000 mC
+ * where t1 = (1/B)* ln(( ADCval * 2.5)/(R25*ITBAT*255))
+ * Default values are R25 = 10e3, B = 3380, ITBAT = 50e-6
+ * Example:
+ * R25=10E3, B=3380, ITBAT=50e-6, ADCVAL=62d calculates
+ * TBAT = 20015 mili degrees Centrigrade
+ *
+*/
+static const int32_t tbat_lookup[255] = {
+	183258, 144221, 124334, 111336, 101826, 94397, 88343, 83257,
+	78889, 75071, 71688, 68656, 65914, 63414, 61120, 59001,
+	570366, 55204, 53490, 51881, 50364, 48931, 47574, 46285,
+	45059, 43889, 42772, 41703, 40678, 39694, 38748, 37838,
+	36961, 36115, 35297, 34507, 33743, 33002, 32284, 31588,
+	30911, 30254, 29615, 28994, 28389, 27799, 27225, 26664,
+	26117, 25584, 25062, 24553, 24054, 23567, 23091, 22624,
+	22167, 21719, 21281, 20851, 20429, 20015, 19610, 19211,
+	18820, 18436, 18058, 17688, 17323, 16965, 16612, 16266,
+	15925, 15589, 15259, 14933, 14613, 14298, 13987, 13681,
+	13379, 13082, 12788, 12499, 12214, 11933, 11655, 11382,
+	11112, 10845, 10582, 10322, 10066, 9812, 9562, 9315,
+	9071, 8830, 8591, 8356, 8123, 7893, 7665, 7440,
+	7218, 6998, 6780, 6565, 6352, 6141, 5933, 5726,
+	5522, 5320, 5120, 4922, 4726, 4532, 4340, 4149,
+	3961, 3774, 3589, 3406, 3225, 3045, 2867, 2690,
+	2516, 2342, 2170, 2000, 1831, 1664, 1498, 1334,
+	1171, 1009, 849, 690, 532, 376, 221, 67,
+	-84, -236, -386, -535, -683, -830, -975, -1119,
+	-1263, -1405, -1546, -1686, -1825, -1964, -2101, -2237,
+	-2372, -2506, -2639, -2771, -2902, -3033, -3162, -3291,
+	-3418, -3545, -3671, -3796, -3920, -4044, -4166, -4288,
+	-4409, -4529, -4649, -4767, -4885, -5002, -5119, -5235,
+	-5349, -5464, -5577, -5690, -5802, -5913, -6024, -6134,
+	-6244, -6352, -6461, -6568, -6675, -6781, -6887, -6992,
+	-7096, -7200, -7303, -7406, -7508, -7609, -7710, -7810,
+	-7910, -8009, -8108, -8206, -8304, -8401, -8497, -8593,
+	-8689, -8784, -8878, -8972, -9066, -9159, -9251, -9343,
+	-9435, -9526, -9617, -9707, -9796, -9886, -9975, -10063,
+	-10151, -10238, -10325, -10412, -10839, -10923, -11007, -11090,
+	-11173, -11256, -11338, -11420, -11501, -11583, -11663, -11744,
+	-11823, -11903, -11982
+};
+
+static const u8 chan_mux[DA9052_ADC_VBBAT + 1] = {
+	[DA9052_ADC_VDDOUT]	= DA9052_ADC_MAN_MUXSEL_VDDOUT,
+	[DA9052_ADC_ICH]	= DA9052_ADC_MAN_MUXSEL_ICH,
+	[DA9052_ADC_TBAT]	= DA9052_ADC_MAN_MUXSEL_TBAT,
+	[DA9052_ADC_VBAT]	= DA9052_ADC_MAN_MUXSEL_VBAT,
+	[DA9052_ADC_IN4]	= DA9052_ADC_MAN_MUXSEL_AD4,
+	[DA9052_ADC_IN5]	= DA9052_ADC_MAN_MUXSEL_AD5,
+	[DA9052_ADC_IN6]	= DA9052_ADC_MAN_MUXSEL_AD6,
+	[DA9052_ADC_VBBAT]	= DA9052_ADC_MAN_MUXSEL_VBBAT
+};
+
+int da9052_adc_manual_read(struct da9052 *da9052, unsigned char channel)
+{
+	int ret;
+	unsigned short calc_data;
+	unsigned short data;
+	unsigned char mux_sel;
+
+	if (channel > DA9052_ADC_VBBAT)
+		return -EINVAL;
+
+	mutex_lock(&da9052->auxadc_lock);
+
+	/* Channel gets activated on enabling the Conversion bit */
+	mux_sel = chan_mux[channel] | DA9052_ADC_MAN_MAN_CONV;
+
+	ret = da9052_reg_write(da9052, DA9052_ADC_MAN_REG, mux_sel);
+	if (ret < 0)
+		goto err;
+
+	/* Wait for an interrupt */
+	if (!wait_for_completion_timeout(&da9052->done,
+					 msecs_to_jiffies(500))) {
+		dev_err(da9052->dev,
+			"timeout waiting for ADC conversion interrupt\n");
+		ret = -ETIMEDOUT;
+		goto err;
+	}
+
+	ret = da9052_reg_read(da9052, DA9052_ADC_RES_H_REG);
+	if (ret < 0)
+		goto err;
+
+	calc_data = (unsigned short)ret;
+	data = calc_data << 2;
+
+	ret = da9052_reg_read(da9052, DA9052_ADC_RES_L_REG);
+	if (ret < 0)
+		goto err;
+
+	calc_data = (unsigned short)(ret & DA9052_ADC_RES_LSB);
+	data |= calc_data;
+
+	ret = data;
+
+err:
+	mutex_unlock(&da9052->auxadc_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(da9052_adc_manual_read);
+
+static irqreturn_t da9052_auxadc_irq(int irq, void *irq_data)
+{
+	struct da9052 *da9052 = irq_data;
+
+	complete(&da9052->done);
+
+	return IRQ_HANDLED;
+}
+
+int da9052_adc_read_temp(struct da9052 *da9052)
+{
+	int tbat;
+
+	tbat = da9052_reg_read(da9052, DA9052_TBAT_RES_REG);
+	if (tbat <= 0)
+		return tbat;
+
+	/* ARRAY_SIZE check is not needed since TBAT is a 8-bit register */
+	return tbat_lookup[tbat - 1];
+}
+EXPORT_SYMBOL_GPL(da9052_adc_read_temp);
+
 static struct resource da9052_rtc_resource = {
 	.name = "ALM",
 	.start = DA9052_IRQ_ALARM,
@@ -646,6 +775,9 @@ int __devinit da9052_device_init(struct da9052 *da9052, u8 chip_id)
 	struct irq_desc *desc;
 	int ret;
 
+	mutex_init(&da9052->auxadc_lock);
+	init_completion(&da9052->done);
+
 	if (pdata && pdata->init != NULL)
 		pdata->init(da9052);
 
@@ -666,6 +798,12 @@ int __devinit da9052_device_init(struct da9052 *da9052, u8 chip_id)
 	desc = irq_to_desc(da9052->chip_irq);
 	da9052->irq_base = regmap_irq_chip_get_base(desc->action->dev_id);
 
+	ret = request_threaded_irq(DA9052_IRQ_ADC_EOM, NULL, da9052_auxadc_irq,
+				   IRQF_TRIGGER_LOW | IRQF_ONESHOT,
+				   "adc irq", da9052);
+	if (ret != 0)
+		dev_err(da9052->dev, "DA9052 ADC IRQ failed ret=%d\n", ret);
+
 	ret = mfd_add_devices(da9052->dev, -1, da9052_subdev_info,
 			      ARRAY_SIZE(da9052_subdev_info), NULL, 0);
 	if (ret)
@@ -674,6 +812,7 @@ int __devinit da9052_device_init(struct da9052 *da9052, u8 chip_id)
 	return 0;
 
 err:
+	free_irq(DA9052_IRQ_ADC_EOM, da9052);
 	mfd_remove_devices(da9052->dev);
 regmap_err:
 	return ret;
@@ -681,6 +820,7 @@ regmap_err:
 
 void da9052_device_exit(struct da9052 *da9052)
 {
+	free_irq(DA9052_IRQ_ADC_EOM, da9052);
 	regmap_del_irq_chip(da9052->chip_irq,
 			    irq_get_irq_data(da9052->irq_base)->chip_data);
 	mfd_remove_devices(da9052->dev);
diff --git a/include/linux/mfd/da9052/da9052.h b/include/linux/mfd/da9052/da9052.h
index 7ffbd6e..b990cca 100644
--- a/include/linux/mfd/da9052/da9052.h
+++ b/include/linux/mfd/da9052/da9052.h
@@ -33,6 +33,18 @@
 
 #include <linux/mfd/da9052/reg.h>
 
+/* Common - HWMON Channel Definations */
+#define DA9052_ADC_VDDOUT	0
+#define DA9052_ADC_ICH		1
+#define DA9052_ADC_TBAT	2
+#define DA9052_ADC_VBAT	3
+#define DA9052_ADC_IN4		4
+#define DA9052_ADC_IN5		5
+#define DA9052_ADC_IN6		6
+#define DA9052_ADC_TSI		7
+#define DA9052_ADC_TJUNC	8
+#define DA9052_ADC_VBBAT	9
+
 #define DA9052_IRQ_DCIN	0
 #define DA9052_IRQ_VBUS	1
 #define DA9052_IRQ_DCINREM	2
@@ -79,12 +91,19 @@ struct da9052 {
 	struct device *dev;
 	struct regmap *regmap;
 
+	struct mutex auxadc_lock;
+	struct completion done;
+
 	int irq_base;
 	u8 chip_id;
 
 	int chip_irq;
 };
 
+/* ADC API */
+int da9052_adc_manual_read(struct da9052 *da9052, unsigned char channel);
+int da9052_adc_read_temp(struct da9052 *da9052);
+
 /* Device I/O API */
 static inline int da9052_reg_read(struct da9052 *da9052, unsigned char reg)
 {
-- 
cgit v0.10.2


From fa648e51f84d060ef991eeee4d7bacec45d7fbfd Mon Sep 17 00:00:00 2001
From: Johan Hovold <jhovold@gmail.com>
Date: Tue, 15 May 2012 19:24:53 +0200
Subject: mfd: Convert lm3533 to use devres

Use devres to manage core driver data and regmap.

Signed-off-by: Johan Hovold <jhovold@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/lm3533-core.c b/drivers/mfd/lm3533-core.c
index 1627472..09019c6 100644
--- a/drivers/mfd/lm3533-core.c
+++ b/drivers/mfd/lm3533-core.c
@@ -603,33 +603,24 @@ static int __devinit lm3533_i2c_probe(struct i2c_client *i2c,
 
 	dev_dbg(&i2c->dev, "%s\n", __func__);
 
-	lm3533 = kzalloc(sizeof(*lm3533), GFP_KERNEL);
+	lm3533 = devm_kzalloc(&i2c->dev, sizeof(*lm3533), GFP_KERNEL);
 	if (!lm3533)
 		return -ENOMEM;
 
 	i2c_set_clientdata(i2c, lm3533);
 
-	lm3533->regmap = regmap_init_i2c(i2c, &regmap_config);
-	if (IS_ERR(lm3533->regmap)) {
-		ret = PTR_ERR(lm3533->regmap);
-		goto err_regmap;
-	}
+	lm3533->regmap = devm_regmap_init_i2c(i2c, &regmap_config);
+	if (IS_ERR(lm3533->regmap))
+		return PTR_ERR(lm3533->regmap);
 
 	lm3533->dev = &i2c->dev;
 	lm3533->irq = i2c->irq;
 
 	ret = lm3533_device_init(lm3533);
 	if (ret)
-		goto err_dev;
+		return ret;
 
 	return 0;
-
-err_dev:
-	regmap_exit(lm3533->regmap);
-err_regmap:
-	kfree(lm3533);
-
-	return ret;
 }
 
 static int __devexit lm3533_i2c_remove(struct i2c_client *i2c)
@@ -639,9 +630,6 @@ static int __devexit lm3533_i2c_remove(struct i2c_client *i2c)
 	dev_dbg(&i2c->dev, "%s\n", __func__);
 
 	lm3533_device_exit(lm3533);
-	regmap_exit(lm3533->regmap);
-
-	kfree(lm3533);
 
 	return 0;
 }
-- 
cgit v0.10.2


From 65ee362cb2c13bd164ade0eda66919a2e16d8a89 Mon Sep 17 00:00:00 2001
From: Johan Hovold <jhovold@gmail.com>
Date: Tue, 15 May 2012 19:45:40 +0200
Subject: mfd: Fix double free in wm8350 error path

Fix double free in probe error path introduced by the recent conversion
of wm8350 to use regmap.

Signed-off-by: Johan Hovold <jhovold@gmail.com>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/wm8350-i2c.c b/drivers/mfd/wm8350-i2c.c
index 271589f..a68aceb 100644
--- a/drivers/mfd/wm8350-i2c.c
+++ b/drivers/mfd/wm8350-i2c.c
@@ -49,15 +49,7 @@ static int wm8350_i2c_probe(struct i2c_client *i2c,
 	i2c_set_clientdata(i2c, wm8350);
 	wm8350->dev = &i2c->dev;
 
-	ret = wm8350_device_init(wm8350, i2c->irq, i2c->dev.platform_data);
-	if (ret < 0)
-		goto err;
-
-	return ret;
-
-err:
-	regmap_exit(wm8350->regmap);
-	return ret;
+	return wm8350_device_init(wm8350, i2c->irq, i2c->dev.platform_data);
 }
 
 static int wm8350_i2c_remove(struct i2c_client *i2c)
-- 
cgit v0.10.2


From 1cb3642a68c983ada0f4090a4dac1d70a96126ca Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Fri, 18 May 2012 13:01:19 +0200
Subject: mfd: mc13xxx core should not be user visible

Since the core is not usable without one of the bus modules it should not
be presented in the UI but should instead be selected by the bus modules.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index b819eea..094bf4e 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -623,7 +623,7 @@ config MFD_MC13783
 	tristate
 
 config MFD_MC13XXX
-	tristate "Support Freescale MC13783 and MC13892"
+	tristate
 	depends on SPI_MASTER || I2C
 	select MFD_CORE
 	select MFD_MC13783
@@ -633,24 +633,22 @@ config MFD_MC13XXX
 	  additional drivers must be enabled in order to use the
 	  functionality of the device.
 
-if MFD_MC13XXX
-
 config MFD_MC13XXX_SPI
-	tristate "MC13xxx SPI interface" if SPI_MASTER
-	default SPI_MASTER
+	tristate "Freescale MC13783 and MC13892 SPI interface"
+	depends on SPI_MASTER
 	select REGMAP_SPI
+	select MFD_MC13XXX
 	help
 	  Select this if your MC13xxx is connected via an SPI bus.
 
 config MFD_MC13XXX_I2C
-	tristate "MC13xxx I2C interface" if I2C
-	default I2C
+	tristate "Freescale MC13892 I2C interface"
+	depends on I2C
 	select REGMAP_I2C
+	select MFD_MC13XXX
 	help
 	  Select this if your MC13xxx is connected via an I2C bus.
 
-endif
-
 config ABX500_CORE
 	bool "ST-Ericsson ABX500 Mixed Signal Circuit register functions"
 	default y if ARCH_U300 || ARCH_U8500
-- 
cgit v0.10.2


From d28f1db8187dd1ddc1fa6f380ff0402cf8e4d44d Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Sat, 19 May 2012 17:21:37 +0200
Subject: mfd: Remove confusing ab8500-i2c file and merge into ab8500-core

ab8500-i2c is used as core code to register the ab8500 device.
After allocating ab8500 memory, it immediately calls into
ab8500-core where the real initialisation takes place. This
patch moves all core registration and memory allocation into
the true ab8500-core file and removes ab8500-i2c completely.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/arch/arm/mach-ux500/board-mop500.c b/arch/arm/mach-ux500/board-mop500.c
index 77d03c1..6dd632d 100644
--- a/arch/arm/mach-ux500/board-mop500.c
+++ b/arch/arm/mach-ux500/board-mop500.c
@@ -205,7 +205,7 @@ static struct resource ab8500_resources[] = {
 };
 
 struct platform_device ab8500_device = {
-	.name = "ab8500-i2c",
+	.name = "ab8500-core",
 	.id = 0,
 	.dev = {
 		.platform_data = &ab8500_platdata,
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index d713851..afe96b2 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -93,12 +93,11 @@ obj-$(CONFIG_AB3100_CORE)	+= ab3100-core.o
 obj-$(CONFIG_AB3100_OTP)	+= ab3100-otp.o
 obj-$(CONFIG_AB5500_CORE)	+= ab5500-core.o
 obj-$(CONFIG_AB5500_DEBUG)	+= ab5500-debugfs.o
-obj-$(CONFIG_AB8500_CORE)	+= ab8500-core.o ab8500-sysctrl.o
 obj-$(CONFIG_AB8500_DEBUG)	+= ab8500-debugfs.o
 obj-$(CONFIG_AB8500_GPADC)	+= ab8500-gpadc.o
 obj-$(CONFIG_MFD_DB8500_PRCMU)	+= db8500-prcmu.o
-# ab8500-i2c need to come after db8500-prcmu (which provides the channel)
-obj-$(CONFIG_AB8500_I2C_CORE)	+= ab8500-i2c.o
+# ab8500-core need to come after db8500-prcmu (which provides the channel)
+obj-$(CONFIG_AB8500_CORE)	+= ab8500-core.o ab8500-sysctrl.o
 obj-$(CONFIG_MFD_DB5500_PRCMU)	+= db5500-prcmu.o
 obj-$(CONFIG_MFD_TIMBERDALE)    += timberdale.o
 obj-$(CONFIG_PMIC_ADP5520)	+= adp5520.o
diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index 6a59919..fb807ab 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -18,6 +18,7 @@
 #include <linux/mfd/core.h>
 #include <linux/mfd/abx500.h>
 #include <linux/mfd/abx500/ab8500.h>
+#include <linux/mfd/dbx500-prcmu.h>
 #include <linux/regulator/ab8500.h>
 
 /*
@@ -137,6 +138,41 @@ static const char ab8500_version_str[][7] = {
 	[AB8500_VERSION_AB8540] = "AB8540",
 };
 
+static int ab8500_i2c_write(struct ab8500 *ab8500, u16 addr, u8 data)
+{
+	int ret;
+
+	ret = prcmu_abb_write((u8)(addr >> 8), (u8)(addr & 0xFF), &data, 1);
+	if (ret < 0)
+		dev_err(ab8500->dev, "prcmu i2c error %d\n", ret);
+	return ret;
+}
+
+static int ab8500_i2c_write_masked(struct ab8500 *ab8500, u16 addr, u8 mask,
+	u8 data)
+{
+	int ret;
+
+	ret = prcmu_abb_write_masked((u8)(addr >> 8), (u8)(addr & 0xFF), &data,
+		&mask, 1);
+	if (ret < 0)
+		dev_err(ab8500->dev, "prcmu i2c error %d\n", ret);
+	return ret;
+}
+
+static int ab8500_i2c_read(struct ab8500 *ab8500, u16 addr)
+{
+	int ret;
+	u8 data;
+
+	ret = prcmu_abb_read((u8)(addr >> 8), (u8)(addr & 0xFF), &data, 1);
+	if (ret < 0) {
+		dev_err(ab8500->dev, "prcmu i2c error %d\n", ret);
+		return ret;
+	}
+	return (int)data;
+}
+
 static int ab8500_get_chip_id(struct device *dev)
 {
 	struct ab8500 *ab8500;
@@ -1169,27 +1205,51 @@ static struct attribute_group ab9540_attr_group = {
 	.attrs	= ab9540_sysfs_entries,
 };
 
-int __devinit ab8500_init(struct ab8500 *ab8500, enum ab8500_version version)
+static int __devinit ab8500_probe(struct platform_device *pdev)
 {
-	struct ab8500_platform_data *plat = dev_get_platdata(ab8500->dev);
+	struct ab8500_platform_data *plat = dev_get_platdata(&pdev->dev);
+	const struct platform_device_id *platid = platform_get_device_id(pdev);
+	enum ab8500_version version = platid->driver_data;
+	struct ab8500 *ab8500;
+	struct resource *resource;
 	int ret;
 	int i;
 	u8 value;
 
+	ab8500 = kzalloc(sizeof *ab8500, GFP_KERNEL);
+	if (!ab8500)
+		return -ENOMEM;
+
 	if (plat)
 		ab8500->irq_base = plat->irq_base;
 
+	ab8500->dev = &pdev->dev;
+
+	resource = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!resource) {
+		ret = -ENODEV;
+		goto out_free_ab8500;
+	}
+
+	ab8500->irq = resource->start;
+
+	ab8500->read = ab8500_i2c_read;
+	ab8500->write = ab8500_i2c_write;
+	ab8500->write_masked = ab8500_i2c_write_masked;
+
 	mutex_init(&ab8500->lock);
 	mutex_init(&ab8500->irq_lock);
 	atomic_set(&ab8500->transfer_ongoing, 0);
 
+	platform_set_drvdata(pdev, ab8500);
+
 	if (version != AB8500_VERSION_UNDEFINED)
 		ab8500->version = version;
 	else {
 		ret = get_register_interruptible(ab8500, AB8500_MISC,
 			AB8500_IC_NAME_REG, &value);
 		if (ret < 0)
-			return ret;
+			goto out_free_ab8500;
 
 		ab8500->version = value;
 	}
@@ -1197,7 +1257,7 @@ int __devinit ab8500_init(struct ab8500 *ab8500, enum ab8500_version version)
 	ret = get_register_interruptible(ab8500, AB8500_MISC,
 		AB8500_REV_REG, &value);
 	if (ret < 0)
-		return ret;
+		goto out_free_ab8500;
 
 	ab8500->chip_id = value;
 
@@ -1342,12 +1402,16 @@ out_freeoldmask:
 	kfree(ab8500->oldmask);
 out_freemask:
 	kfree(ab8500->mask);
+out_free_ab8500:
+	kfree(ab8500);
 
 	return ret;
 }
 
-int __devexit ab8500_exit(struct ab8500 *ab8500)
+static int __devexit ab8500_remove(struct platform_device *pdev)
 {
+	struct ab8500 *ab8500 = platform_get_drvdata(pdev);
+
 	if (is_ab9540(ab8500))
 		sysfs_remove_group(&ab8500->dev->kobj, &ab9540_attr_group);
 	else
@@ -1359,10 +1423,41 @@ int __devexit ab8500_exit(struct ab8500 *ab8500)
 	}
 	kfree(ab8500->oldmask);
 	kfree(ab8500->mask);
+	kfree(ab8500);
 
 	return 0;
 }
 
+static const struct platform_device_id ab8500_id[] = {
+	{ "ab8500-core", AB8500_VERSION_AB8500 },
+	{ "ab8505-i2c", AB8500_VERSION_AB8505 },
+	{ "ab9540-i2c", AB8500_VERSION_AB9540 },
+	{ "ab8540-i2c", AB8500_VERSION_AB8540 },
+	{ }
+};
+
+static struct platform_driver ab8500_core_driver = {
+	.driver = {
+		.name = "ab8500-core",
+		.owner = THIS_MODULE,
+	},
+	.probe	= ab8500_probe,
+	.remove	= __devexit_p(ab8500_remove),
+	.id_table = ab8500_id,
+};
+
+static int __init ab8500_core_init(void)
+{
+	return platform_driver_register(&ab8500_core_driver);
+}
+
+static void __exit ab8500_core_exit(void)
+{
+	platform_driver_unregister(&ab8500_core_driver);
+}
+arch_initcall(ab8500_core_init);
+module_exit(ab8500_core_exit);
+
 MODULE_AUTHOR("Mattias Wallin, Srinidhi Kasagar, Rabin Vincent");
 MODULE_DESCRIPTION("AB8500 MFD core");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/mfd/ab8500-i2c.c b/drivers/mfd/ab8500-i2c.c
deleted file mode 100644
index 89b31a3..0000000
--- a/drivers/mfd/ab8500-i2c.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (C) ST-Ericsson SA 2010
- * Author: Mattias Wallin <mattias.wallin@stericsson.com> for ST-Ericsson.
- * License Terms: GNU General Public License v2
- * This file was based on drivers/mfd/ab8500-spi.c
- */
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/mfd/abx500/ab8500.h>
-#include <linux/mfd/dbx500-prcmu.h>
-
-static int ab8500_i2c_write(struct ab8500 *ab8500, u16 addr, u8 data)
-{
-	int ret;
-
-	ret = prcmu_abb_write((u8)(addr >> 8), (u8)(addr & 0xFF), &data, 1);
-	if (ret < 0)
-		dev_err(ab8500->dev, "prcmu i2c error %d\n", ret);
-	return ret;
-}
-
-static int ab8500_i2c_write_masked(struct ab8500 *ab8500, u16 addr, u8 mask,
-	u8 data)
-{
-	int ret;
-
-	ret = prcmu_abb_write_masked((u8)(addr >> 8), (u8)(addr & 0xFF), &data,
-		&mask, 1);
-	if (ret < 0)
-		dev_err(ab8500->dev, "prcmu i2c error %d\n", ret);
-	return ret;
-}
-
-static int ab8500_i2c_read(struct ab8500 *ab8500, u16 addr)
-{
-	int ret;
-	u8 data;
-
-	ret = prcmu_abb_read((u8)(addr >> 8), (u8)(addr & 0xFF), &data, 1);
-	if (ret < 0) {
-		dev_err(ab8500->dev, "prcmu i2c error %d\n", ret);
-		return ret;
-	}
-	return (int)data;
-}
-
-static int __devinit ab8500_i2c_probe(struct platform_device *plf)
-{
-	const struct platform_device_id *platid = platform_get_device_id(plf);
-	struct ab8500 *ab8500;
-	struct resource *resource;
-	int ret;
-
-	ab8500 = kzalloc(sizeof *ab8500, GFP_KERNEL);
-	if (!ab8500)
-		return -ENOMEM;
-
-	ab8500->dev = &plf->dev;
-
-	resource = platform_get_resource(plf, IORESOURCE_IRQ, 0);
-	if (!resource) {
-		kfree(ab8500);
-		return -ENODEV;
-	}
-
-	ab8500->irq = resource->start;
-
-	ab8500->read = ab8500_i2c_read;
-	ab8500->write = ab8500_i2c_write;
-	ab8500->write_masked = ab8500_i2c_write_masked;
-
-	platform_set_drvdata(plf, ab8500);
-
-	ret = ab8500_init(ab8500, platid->driver_data);
-	if (ret)
-		kfree(ab8500);
-
-
-	return ret;
-}
-
-static int __devexit ab8500_i2c_remove(struct platform_device *plf)
-{
-	struct ab8500 *ab8500 = platform_get_drvdata(plf);
-
-	ab8500_exit(ab8500);
-	kfree(ab8500);
-
-	return 0;
-}
-
-static const struct platform_device_id ab8500_id[] = {
-	{ "ab8500-i2c", AB8500_VERSION_AB8500 },
-	{ "ab8505-i2c", AB8500_VERSION_AB8505 },
-	{ "ab9540-i2c", AB8500_VERSION_AB9540 },
-	{ "ab8540-i2c", AB8500_VERSION_AB8540 },
-	{ }
-};
-
-#ifdef CONFIG_PM
-static int ab8500_i2c_suspend(struct device *dev)
-{
-	struct ab8500 *ab = dev_get_drvdata(dev);
-
-	disable_irq(ab->irq);
-	enable_irq_wake(ab->irq);
-
-	return 0;
-}
-
-static int ab8500_i2c_resume(struct device *dev)
-{
-	struct ab8500 *ab = dev_get_drvdata(dev);
-
-	disable_irq_wake(ab->irq);
-	enable_irq(ab->irq);
-
-	return 0;
-}
-
-static const struct dev_pm_ops ab8500_i2c_pm_ops = {
-	.suspend	= ab8500_i2c_suspend,
-	.resume		= ab8500_i2c_resume,
-};
-
-#define AB8500_I2C_PM_OPS	(&ab8500_i2c_pm_ops)
-#else
-#define AB8500_I2C_PM_OPS	NULL
-#endif
-
-static struct platform_driver ab8500_i2c_driver = {
-	.driver = {
-		.name = "ab8500-i2c",
-		.owner = THIS_MODULE,
-		.pm	= AB8500_I2C_PM_OPS,
-	},
-	.probe	= ab8500_i2c_probe,
-	.remove	= __devexit_p(ab8500_i2c_remove),
-	.id_table = ab8500_id,
-};
-
-static int __init ab8500_i2c_init(void)
-{
-	return platform_driver_register(&ab8500_i2c_driver);
-}
-
-static void __exit ab8500_i2c_exit(void)
-{
-	platform_driver_unregister(&ab8500_i2c_driver);
-}
-arch_initcall(ab8500_i2c_init);
-module_exit(ab8500_i2c_exit);
-
-MODULE_AUTHOR("Mattias WALLIN <mattias.wallin@stericsson.com");
-MODULE_DESCRIPTION("AB8500 Core access via PRCMU I2C");
-MODULE_LICENSE("GPL v2");
-- 
cgit v0.10.2


From 6bc4a568414caab05424b702165a732177daccd0 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 17 May 2012 14:45:13 +0100
Subject: mfd: Enable Device Tree for ab8500-core driver

This patch will allow the ab8500-core driver to be probed and set up
when booting when Device Tree is enabled. This includes platform ID
look-up which identifies the machine it is currently running on. If
we are undergoing a DT enabled boot, we will refuse to setup each of
the other ab8500-* devices, as they will be probed individually by DT.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index fb807ab..dac0e29 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -20,6 +20,8 @@
 #include <linux/mfd/abx500/ab8500.h>
 #include <linux/mfd/dbx500-prcmu.h>
 #include <linux/regulator/ab8500.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
 
 /*
  * Interrupt register offsets
@@ -1205,11 +1207,20 @@ static struct attribute_group ab9540_attr_group = {
 	.attrs	= ab9540_sysfs_entries,
 };
 
+static const struct of_device_id ab8500_match[] = {
+	{
+		.compatible = "stericsson,ab8500",
+		.data = (void *)AB8500_VERSION_AB8500,
+	},
+	{},
+};
+
 static int __devinit ab8500_probe(struct platform_device *pdev)
 {
 	struct ab8500_platform_data *plat = dev_get_platdata(&pdev->dev);
 	const struct platform_device_id *platid = platform_get_device_id(pdev);
-	enum ab8500_version version = platid->driver_data;
+	enum ab8500_version version = AB8500_VERSION_UNDEFINED;
+	struct device_node *np = pdev->dev.of_node;
 	struct ab8500 *ab8500;
 	struct resource *resource;
 	int ret;
@@ -1222,6 +1233,14 @@ static int __devinit ab8500_probe(struct platform_device *pdev)
 
 	if (plat)
 		ab8500->irq_base = plat->irq_base;
+	else if (np)
+		ret = of_property_read_u32(np, "stericsson,irq-base", &ab8500->irq_base);
+
+	if (!ab8500->irq_base) {
+		dev_info(&pdev->dev, "couldn't find irq-base\n");
+		ret = -EINVAL;
+		goto out_free_ab8500;
+	}
 
 	ab8500->dev = &pdev->dev;
 
@@ -1243,6 +1262,12 @@ static int __devinit ab8500_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, ab8500);
 
+	if (platid)
+		version = platid->driver_data;
+	else if (np)
+		version = (unsigned int)
+			of_match_device(ab8500_match, &pdev->dev)->data;
+
 	if (version != AB8500_VERSION_UNDEFINED)
 		ab8500->version = version;
 	else {
@@ -1348,29 +1373,32 @@ static int __devinit ab8500_probe(struct platform_device *pdev)
 			goto out_removeirq;
 	}
 
-	ret = mfd_add_devices(ab8500->dev, 0, abx500_common_devs,
-			      ARRAY_SIZE(abx500_common_devs), NULL,
-			      ab8500->irq_base);
-
-	if (ret)
-		goto out_freeirq;
+	if (!np) {
+		ret = mfd_add_devices(ab8500->dev, 0, abx500_common_devs,
+				ARRAY_SIZE(abx500_common_devs), NULL,
+				ab8500->irq_base);
 
-	if (is_ab9540(ab8500))
-		ret = mfd_add_devices(ab8500->dev, 0, ab9540_devs,
-			      ARRAY_SIZE(ab9540_devs), NULL,
-			      ab8500->irq_base);
-	else
-		ret = mfd_add_devices(ab8500->dev, 0, ab8500_devs,
-			      ARRAY_SIZE(ab8500_devs), NULL,
-			      ab8500->irq_base);
+		if (ret)
+			goto out_freeirq;
 
-	if (is_ab9540(ab8500) || is_ab8505(ab8500))
-		ret = mfd_add_devices(ab8500->dev, 0, ab9540_ab8505_devs,
-			      ARRAY_SIZE(ab9540_ab8505_devs), NULL,
-			      ab8500->irq_base);
+		if (is_ab9540(ab8500))
+			ret = mfd_add_devices(ab8500->dev, 0, ab9540_devs,
+					ARRAY_SIZE(ab9540_devs), NULL,
+					ab8500->irq_base);
+		else
+			ret = mfd_add_devices(ab8500->dev, 0, ab8500_devs,
+					ARRAY_SIZE(ab8500_devs), NULL,
+					ab8500->irq_base);
+		if (ret)
+			goto out_freeirq;
 
-	if (ret)
-		goto out_freeirq;
+		if (is_ab9540(ab8500) || is_ab8505(ab8500))
+			ret = mfd_add_devices(ab8500->dev, 0, ab9540_ab8505_devs,
+					ARRAY_SIZE(ab9540_ab8505_devs), NULL,
+					ab8500->irq_base);
+		if (ret)
+			goto out_freeirq;
+	}
 
 	if (!no_bm) {
 		/* Add battery management devices */
@@ -1440,6 +1468,7 @@ static struct platform_driver ab8500_core_driver = {
 	.driver = {
 		.name = "ab8500-core",
 		.owner = THIS_MODULE,
+		.of_match_table = ab8500_match,
 	},
 	.probe	= ab8500_probe,
 	.remove	= __devexit_p(ab8500_remove),
-- 
cgit v0.10.2


From 5a8fea031ed5462b63877e0f6d29e0fc0fb82f14 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 17 May 2012 14:45:19 +0100
Subject: mfd: Enable ab8500-debug when Device Tree is enabled

Allow the ab8500-debugfs driver to be probed during DT start-up.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c
index 9a0211a..50c4c89 100644
--- a/drivers/mfd/ab8500-debugfs.c
+++ b/drivers/mfd/ab8500-debugfs.c
@@ -608,10 +608,16 @@ static int __devexit ab8500_debug_remove(struct platform_device *plf)
 	return 0;
 }
 
+static const struct of_device_id ab8500_debug_match[] = {
+        { .compatible = "stericsson,ab8500-debug", },
+        {}
+};
+
 static struct platform_driver ab8500_debug_driver = {
 	.driver = {
 		.name = "ab8500-debug",
 		.owner = THIS_MODULE,
+		.of_match_table = ab8500_debug_match,
 	},
 	.probe  = ab8500_debug_probe,
 	.remove = __devexit_p(ab8500_debug_remove)
-- 
cgit v0.10.2


From 6dff11e5ab2f752ad75d0b506fcc981fef9999c5 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 17 May 2012 14:45:20 +0100
Subject: mfd: Prevent unassigned pointer from being used in ab8500-gpadc
 driver

Before this patch if probe failed to find the platform IRQ it
would attempt to print a message out using dev_err, which in
turn was being passed an unassigned pointer. This patch
ensures the information passed to dev_err is correct.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/ab8500-gpadc.c b/drivers/mfd/ab8500-gpadc.c
index c39fc71..c1656ab 100644
--- a/drivers/mfd/ab8500-gpadc.c
+++ b/drivers/mfd/ab8500-gpadc.c
@@ -584,7 +584,7 @@ static int __devinit ab8500_gpadc_probe(struct platform_device *pdev)
 
 	gpadc->irq = platform_get_irq_byname(pdev, "SW_CONV_END");
 	if (gpadc->irq < 0) {
-		dev_err(gpadc->dev, "failed to get platform irq-%d\n",
+		dev_err(&pdev->dev, "failed to get platform irq-%d\n",
 			gpadc->irq);
 		ret = gpadc->irq;
 		goto fail;
-- 
cgit v0.10.2


From 3690fb2ca5f4ecb9424d02598cb55d300f48d844 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 17 May 2012 14:45:22 +0100
Subject: mfd: Enable ab8500-gpadc driver for Device Tree

This patch will allow the ab8500-gpadc driver to be probed during
Device Tree enabled boot.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/ab8500-gpadc.c b/drivers/mfd/ab8500-gpadc.c
index c1656ab..b86fd8e 100644
--- a/drivers/mfd/ab8500-gpadc.c
+++ b/drivers/mfd/ab8500-gpadc.c
@@ -648,12 +648,18 @@ static int __devexit ab8500_gpadc_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct of_device_id ab8500_gpadc_match[] = {
+	{ .compatible = "stericsson,ab8500-gpadc", },
+	{}
+};
+
 static struct platform_driver ab8500_gpadc_driver = {
 	.probe = ab8500_gpadc_probe,
 	.remove = __devexit_p(ab8500_gpadc_remove),
 	.driver = {
 		.name = "ab8500-gpadc",
 		.owner = THIS_MODULE,
+		.of_match_table = ab8500_gpadc_match,
 	},
 };
 
-- 
cgit v0.10.2


From 68029c6cf9750ff0d4ed5c812a3755cbd855862a Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Wed, 16 May 2012 14:11:55 +0300
Subject: mfd: twl6040 code cleanup in interrupt initialization part

No functional change, just to make the code a bit more uniform and
remove wrapped lines.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/twl6040-irq.c b/drivers/mfd/twl6040-irq.c
index b3f8dda..008022c 100644
--- a/drivers/mfd/twl6040-irq.c
+++ b/drivers/mfd/twl6040-irq.c
@@ -138,7 +138,7 @@ static irqreturn_t twl6040_irq_thread(int irq, void *data)
 
 int twl6040_irq_init(struct twl6040 *twl6040)
 {
-	int cur_irq, ret;
+	int i, nr_irqs, ret;
 	u8 val;
 
 	mutex_init(&twl6040->irq_mutex);
@@ -148,21 +148,20 @@ int twl6040_irq_init(struct twl6040 *twl6040)
 	twl6040->irq_masks_cache = TWL6040_ALLINT_MSK;
 	twl6040_reg_write(twl6040, TWL6040_REG_INTMR, TWL6040_ALLINT_MSK);
 
+	nr_irqs = ARRAY_SIZE(twl6040_irqs);
 	/* Register them with genirq */
-	for (cur_irq = twl6040->irq_base;
-	     cur_irq < twl6040->irq_base + ARRAY_SIZE(twl6040_irqs);
-	     cur_irq++) {
-		irq_set_chip_data(cur_irq, twl6040);
-		irq_set_chip_and_handler(cur_irq, &twl6040_irq_chip,
+	for (i = twl6040->irq_base; i < twl6040->irq_base + nr_irqs; i++) {
+		irq_set_chip_data(i, twl6040);
+		irq_set_chip_and_handler(i, &twl6040_irq_chip,
 					 handle_level_irq);
-		irq_set_nested_thread(cur_irq, 1);
+		irq_set_nested_thread(i, 1);
 
 		/* ARM needs us to explicitly flag the IRQ as valid
 		 * and will set them noprobe when we do so. */
 #ifdef CONFIG_ARM
-		set_irq_flags(cur_irq, IRQF_VALID);
+		set_irq_flags(i, IRQF_VALID);
 #else
-		irq_set_noprobe(cur_irq);
+		irq_set_noprobe(i);
 #endif
 	}
 
-- 
cgit v0.10.2


From 6712419d697851c4472cdfd2111c844d777472e8 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Wed, 16 May 2012 14:11:56 +0300
Subject: mfd: Allocate twl6040 IRQ numbers dynamically

Use irq_alloc_descs() to get the IRQ number range dynamically instead of
the hardwired use if pdata->irq_base.
The twl6040 only provides interrupts for it's internal components which
means that it is not working as an IRQ expander type of device.
The client drivers will receive their interrupt numbers as resource which
is configured based on the received IRQ range we got from irq_alloc_descs()

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/twl6040-core.c b/drivers/mfd/twl6040-core.c
index 7a92d95..c50fba7 100644
--- a/drivers/mfd/twl6040-core.c
+++ b/drivers/mfd/twl6040-core.c
@@ -515,7 +515,7 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 	}
 
 	/* In order to operate correctly we need valid interrupt config */
-	if (!client->irq || !pdata->irq_base) {
+	if (!client->irq) {
 		dev_err(&client->dev, "Invalid IRQ configuration\n");
 		return -EINVAL;
 	}
@@ -552,7 +552,6 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 
 	twl6040->dev = &client->dev;
 	twl6040->irq = client->irq;
-	twl6040->irq_base = pdata->irq_base;
 
 	mutex_init(&twl6040->mutex);
 	mutex_init(&twl6040->io_mutex);
diff --git a/drivers/mfd/twl6040-irq.c b/drivers/mfd/twl6040-irq.c
index 008022c..914978e 100644
--- a/drivers/mfd/twl6040-irq.c
+++ b/drivers/mfd/twl6040-irq.c
@@ -23,6 +23,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/err.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
 #include <linux/mfd/core.h>
@@ -138,7 +139,7 @@ static irqreturn_t twl6040_irq_thread(int irq, void *data)
 
 int twl6040_irq_init(struct twl6040 *twl6040)
 {
-	int i, nr_irqs, ret;
+	int i, nr_irqs, irq_base, ret;
 	u8 val;
 
 	mutex_init(&twl6040->irq_mutex);
@@ -149,8 +150,16 @@ int twl6040_irq_init(struct twl6040 *twl6040)
 	twl6040_reg_write(twl6040, TWL6040_REG_INTMR, TWL6040_ALLINT_MSK);
 
 	nr_irqs = ARRAY_SIZE(twl6040_irqs);
+
+	irq_base = irq_alloc_descs(-1, 0, nr_irqs, 0);
+	if (IS_ERR_VALUE(irq_base)) {
+		dev_err(twl6040->dev, "Fail to allocate IRQ descs\n");
+		return irq_base;
+	}
+	twl6040->irq_base = irq_base;
+
 	/* Register them with genirq */
-	for (i = twl6040->irq_base; i < twl6040->irq_base + nr_irqs; i++) {
+	for (i = irq_base; i < irq_base + nr_irqs; i++) {
 		irq_set_chip_data(i, twl6040);
 		irq_set_chip_and_handler(i, &twl6040_irq_chip,
 					 handle_level_irq);
-- 
cgit v0.10.2


From 1f01d60e4c0cae3416071680635f227df0020dd8 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Wed, 16 May 2012 14:11:57 +0300
Subject: mfd: Register the twl6040 child for the ASoC codec unconditionally

The main function of the twl6040 is to provide audio on OMAP4+ platforms.
Since the ASoC codec driver can work without the pdata we can register the
child to load the codec driver whenever the twl6040 MFD driver is loaded.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/twl6040-core.c b/drivers/mfd/twl6040-core.c
index c50fba7..9765dc2 100644
--- a/drivers/mfd/twl6040-core.c
+++ b/drivers/mfd/twl6040-core.c
@@ -507,7 +507,7 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 	struct twl6040_platform_data *pdata = client->dev.platform_data;
 	struct twl6040 *twl6040;
 	struct mfd_cell *cell = NULL;
-	int ret, children = 0;
+	int irq, ret, children = 0;
 
 	if (!pdata) {
 		dev_err(&client->dev, "Platform data is missing\n");
@@ -589,22 +589,27 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 	/* dual-access registers controlled by I2C only */
 	twl6040_set_bits(twl6040, TWL6040_REG_ACCCTL, TWL6040_I2CSEL);
 
+	/*
+	 * The main functionality of twl6040 to provide audio on OMAP4+ systems.
+	 * We can add the ASoC codec child whenever this driver has been loaded.
+	 * The ASoC codec can work without pdata, pass the platform_data only if
+	 * it has been provided.
+	 */
+	irq = twl6040->irq_base + TWL6040_IRQ_PLUG;
+	cell = &twl6040->cells[children];
+	cell->name = "twl6040-codec";
+	twl6040_codec_rsrc[0].start = irq;
+	twl6040_codec_rsrc[0].end = irq;
+	cell->resources = twl6040_codec_rsrc;
+	cell->num_resources = ARRAY_SIZE(twl6040_codec_rsrc);
 	if (pdata->codec) {
-		int irq = twl6040->irq_base + TWL6040_IRQ_PLUG;
-
-		cell = &twl6040->cells[children];
-		cell->name = "twl6040-codec";
-		twl6040_codec_rsrc[0].start = irq;
-		twl6040_codec_rsrc[0].end = irq;
-		cell->resources = twl6040_codec_rsrc;
-		cell->num_resources = ARRAY_SIZE(twl6040_codec_rsrc);
 		cell->platform_data = pdata->codec;
 		cell->pdata_size = sizeof(*pdata->codec);
-		children++;
 	}
+	children++;
 
 	if (pdata->vibra) {
-		int irq = twl6040->irq_base + TWL6040_IRQ_VIB;
+		irq = twl6040->irq_base + TWL6040_IRQ_VIB;
 
 		cell = &twl6040->cells[children];
 		cell->name = "twl6040-vibra";
@@ -618,16 +623,10 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 		children++;
 	}
 
-	if (children) {
-		ret = mfd_add_devices(&client->dev, -1, twl6040->cells,
-				      children, NULL, 0);
-		if (ret)
-			goto mfd_err;
-	} else {
-		dev_err(&client->dev, "No platform data found for children\n");
-		ret = -ENODEV;
+	ret = mfd_add_devices(&client->dev, -1, twl6040->cells, children,
+			      NULL, 0);
+	if (ret)
 		goto mfd_err;
-	}
 
 	return 0;
 
-- 
cgit v0.10.2


From 37e13cecaa141eccce705843f5d2f7509e29bd3a Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Wed, 16 May 2012 14:11:58 +0300
Subject: mfd: Add support for Device Tree to twl6040

Device tree based probing support for the core twl6040 driver. Child
devices will be created as MFD devices:
- ASoC codec is always created
- Vibra child is only created if the vibra section present in the DT blob.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/Documentation/devicetree/bindings/mfd/twl6040.txt b/Documentation/devicetree/bindings/mfd/twl6040.txt
new file mode 100644
index 0000000..bc67c6f
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/twl6040.txt
@@ -0,0 +1,62 @@
+Texas Instruments TWL6040 family
+
+The TWL6040s are 8-channel high quality low-power audio codecs providing audio
+and vibra functionality on OMAP4+ platforms.
+They are connected ot the host processor via i2c for commands, McPDM for audio
+data and commands.
+
+Required properties:
+- compatible : Must be "ti,twl6040";
+- reg: must be 0x4b for i2c address
+- interrupts: twl6040 has one interrupt line connecteded to the main SoC
+- interrupt-parent: The parent interrupt controller
+- twl6040,audpwron-gpio: Power on GPIO line for the twl6040
+
+- vio-supply: Regulator for the twl6040 VIO supply
+- v2v1-supply: Regulator for the twl6040 V2V1 supply
+
+Optional properties, nodes:
+- enable-active-high: To power on the twl6040 during boot.
+
+Vibra functionality
+Required properties:
+- vddvibl-supply: Regulator for the left vibra motor
+- vddvibr-supply: Regulator for the right vibra motor
+- vibra { }: Configuration section for vibra parameters containing the following
+	     properties:
+- ti,vibldrv-res: Resistance parameter for left driver
+- ti,vibrdrv-res: Resistance parameter for right driver
+- ti,viblmotor-res: Resistance parameter for left motor
+- ti,viblmotor-res: Resistance parameter for right motor
+
+Optional properties within vibra { } section:
+- vddvibl_uV: If the vddvibl default voltage need to be changed
+- vddvibr_uV: If the vddvibr default voltage need to be changed
+
+Example:
+&i2c1 {
+	twl6040: twl@4b {
+		compatible = "ti,twl6040";
+		reg = <0x4b>;
+
+		interrupts = <0 119 4>;
+		interrupt-parent = <&gic>;
+		twl6040,audpwron-gpio = <&gpio4 31 0>;
+
+		vio-supply = <&v1v8>;
+		v2v1-supply = <&v2v1>;
+		enable-active-high;
+
+		/* regulators for vibra motor */
+		vddvibl-supply = <&vbat>;
+		vddvibr-supply = <&vbat>;
+
+		vibra {
+			/* Vibra driver, motor resistance parameters */
+			ti,vibldrv-res = <8>;
+			ti,vibrdrv-res = <3>;
+			ti,viblmotor-res = <10>;
+			ti,vibrmotor-res = <10>;
+		};
+	};
+};
diff --git a/drivers/mfd/twl6040-core.c b/drivers/mfd/twl6040-core.c
index 9765dc2..450a28f 100644
--- a/drivers/mfd/twl6040-core.c
+++ b/drivers/mfd/twl6040-core.c
@@ -29,6 +29,10 @@
 #include <linux/kernel.h>
 #include <linux/err.h>
 #include <linux/platform_device.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/of_gpio.h>
+#include <linux/of_platform.h>
 #include <linux/gpio.h>
 #include <linux/delay.h>
 #include <linux/i2c.h>
@@ -505,11 +509,12 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 				     const struct i2c_device_id *id)
 {
 	struct twl6040_platform_data *pdata = client->dev.platform_data;
+	struct device_node *node = client->dev.of_node;
 	struct twl6040 *twl6040;
 	struct mfd_cell *cell = NULL;
 	int irq, ret, children = 0;
 
-	if (!pdata) {
+	if (!pdata && !node) {
 		dev_err(&client->dev, "Platform data is missing\n");
 		return -EINVAL;
 	}
@@ -560,9 +565,13 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 	twl6040->rev = twl6040_reg_read(twl6040, TWL6040_REG_ASICREV);
 
 	/* ERRATA: Automatic power-up is not possible in ES1.0 */
-	if (twl6040_get_revid(twl6040) > TWL6040_REV_ES1_0)
-		twl6040->audpwron = pdata->audpwron_gpio;
-	else
+	if (twl6040_get_revid(twl6040) > TWL6040_REV_ES1_0) {
+		if (pdata)
+			twl6040->audpwron = pdata->audpwron_gpio;
+		else
+			twl6040->audpwron = of_get_named_gpio(node,
+						"ti,audpwron-gpio", 0);
+	} else
 		twl6040->audpwron = -EINVAL;
 
 	if (gpio_is_valid(twl6040->audpwron)) {
@@ -602,13 +611,13 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 	twl6040_codec_rsrc[0].end = irq;
 	cell->resources = twl6040_codec_rsrc;
 	cell->num_resources = ARRAY_SIZE(twl6040_codec_rsrc);
-	if (pdata->codec) {
+	if (pdata && pdata->codec) {
 		cell->platform_data = pdata->codec;
 		cell->pdata_size = sizeof(*pdata->codec);
 	}
 	children++;
 
-	if (pdata->vibra) {
+	if ((pdata && pdata->vibra) || of_find_node_by_name(node, "vibra")) {
 		irq = twl6040->irq_base + TWL6040_IRQ_VIB;
 
 		cell = &twl6040->cells[children];
@@ -618,8 +627,10 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 		cell->resources = twl6040_vibra_rsrc;
 		cell->num_resources = ARRAY_SIZE(twl6040_vibra_rsrc);
 
-		cell->platform_data = pdata->vibra;
-		cell->pdata_size = sizeof(*pdata->vibra);
+		if (pdata && pdata->vibra) {
+			cell->platform_data = pdata->vibra;
+			cell->pdata_size = sizeof(*pdata->vibra);
+		}
 		children++;
 	}
 
diff --git a/drivers/mfd/twl6040-irq.c b/drivers/mfd/twl6040-irq.c
index 914978e..4b42543 100644
--- a/drivers/mfd/twl6040-irq.c
+++ b/drivers/mfd/twl6040-irq.c
@@ -25,6 +25,8 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/irq.h>
+#include <linux/of.h>
+#include <linux/irqdomain.h>
 #include <linux/interrupt.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/twl6040.h>
@@ -139,6 +141,7 @@ static irqreturn_t twl6040_irq_thread(int irq, void *data)
 
 int twl6040_irq_init(struct twl6040 *twl6040)
 {
+	struct device_node *node = twl6040->dev->of_node;
 	int i, nr_irqs, irq_base, ret;
 	u8 val;
 
@@ -158,6 +161,9 @@ int twl6040_irq_init(struct twl6040 *twl6040)
 	}
 	twl6040->irq_base = irq_base;
 
+	irq_domain_add_legacy(node, ARRAY_SIZE(twl6040_irqs), irq_base, 0,
+			      &irq_domain_simple_ops, NULL);
+
 	/* Register them with genirq */
 	for (i = irq_base; i < irq_base + nr_irqs; i++) {
 		irq_set_chip_data(i, twl6040);
-- 
cgit v0.10.2


From 01247ef7ecbf36515a1cc3c2b34e4738aab1bd40 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Fri, 18 May 2012 09:39:10 +0100
Subject: mfd: Enable Device Tree support in the ab8500-sysctrl driver

This patch ensures probing of the ab8500-sysctrl driver during a DT
enabled boot, so long as the associated nodes are present in the
Device Tree binary.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/ab8500-sysctrl.c b/drivers/mfd/ab8500-sysctrl.c
index c28d4eb..5a3e51c 100644
--- a/drivers/mfd/ab8500-sysctrl.c
+++ b/drivers/mfd/ab8500-sysctrl.c
@@ -61,10 +61,16 @@ static int __devexit ab8500_sysctrl_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct of_device_id ab8500_sysctrl_match[] = {
+	{ .compatible = "stericsson,ab8500-sysctrl", },
+	{}
+};
+
 static struct platform_driver ab8500_sysctrl_driver = {
 	.driver = {
 		.name = "ab8500-sysctrl",
 		.owner = THIS_MODULE,
+		.of_match_table = ab8500_sysctrl_match,
 	},
 	.probe = ab8500_sysctrl_probe,
 	.remove = __devexit_p(ab8500_sysctrl_remove),
-- 
cgit v0.10.2


From 3770c75a6852a87afc66cd514c13899f8a288cef Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Fri, 18 May 2012 09:39:12 +0100
Subject: mfd: Enable Device Tree support in the ab8500-pwm driver

This patch ensures probing of the ab8500-pwm driver during a DT
enabled boot, so long as the associated nodes are present in the
Device Tree binary.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/misc/ab8500-pwm.c b/drivers/misc/ab8500-pwm.c
index d7a9aa1..042a8fe 100644
--- a/drivers/misc/ab8500-pwm.c
+++ b/drivers/misc/ab8500-pwm.c
@@ -142,10 +142,16 @@ static int __devexit ab8500_pwm_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct of_device_id ab8500_pwm_match[] = {
+	{ .compatible = "stericsson,ab8500-pwm", },
+	{}
+};
+
 static struct platform_driver ab8500_pwm_driver = {
 	.driver = {
 		.name = "ab8500-pwm",
 		.owner = THIS_MODULE,
+		.of_match_table = ab8500_pwm_match,
 	},
 	.probe = ab8500_pwm_probe,
 	.remove = __devexit_p(ab8500_pwm_remove),
-- 
cgit v0.10.2


From 6090885597e6500f6671a0f7bab9e82dfa6f38be Mon Sep 17 00:00:00 2001
From: Johan Hovold <jhovold@gmail.com>
Date: Sun, 20 May 2012 15:16:10 +0200
Subject: mfd: Fix return type of lm533 attribute is_visible

Since commit 587a1f1659 ("switch ->is_visible() to returning umode_t")
the return type of is_visible is umode_t rather than mode_t.

This silences a compiler warning on some architectures where these types
are not compatible.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Johan Hovold <jhovold@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/lm3533-core.c b/drivers/mfd/lm3533-core.c
index 09019c6..104faba 100644
--- a/drivers/mfd/lm3533-core.c
+++ b/drivers/mfd/lm3533-core.c
@@ -367,7 +367,7 @@ static umode_t lm3533_attr_is_visible(struct kobject *kobj,
 	struct device_attribute *dattr = to_dev_attr(attr);
 	struct lm3533_device_attribute *lattr = to_lm3533_dev_attr(dattr);
 	enum lm3533_attribute_type type = lattr->type;
-	mode_t mode = attr->mode;
+	umode_t mode = attr->mode;
 
 	if (!lm3533->have_backlights && type == LM3533_ATTR_TYPE_BACKLIGHT)
 		mode = 0;
-- 
cgit v0.10.2


From c48bf153f293894d232ee44f5bff99c523a06585 Mon Sep 17 00:00:00 2001
From: Johan Hovold <jhovold@gmail.com>
Date: Fri, 18 May 2012 20:22:46 +0200
Subject: mfd: Mark two lm3533 zone registers as volatile

Mark the two currently unused zone registers as volatile in regmap for
completeness.

Signed-off-by: Johan Hovold <jhovold@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/lm3533-core.c b/drivers/mfd/lm3533-core.c
index 104faba..0b2879b 100644
--- a/drivers/mfd/lm3533-core.c
+++ b/drivers/mfd/lm3533-core.c
@@ -567,7 +567,7 @@ static bool lm3533_readable_register(struct device *dev, unsigned int reg)
 static bool lm3533_volatile_register(struct device *dev, unsigned int reg)
 {
 	switch (reg) {
-	case 0x34:		/* zone */
+	case 0x34 ... 0x36:	/* zone */
 	case 0x37 ... 0x38:	/* adc */
 	case 0xb0 ... 0xb1:	/* fault */
 		return true;
-- 
cgit v0.10.2


From 1fe17a24e2fe0a9554d19a4249eb2d80050ecb8c Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Fri, 18 May 2012 17:02:02 +0100
Subject: mfd: Emulate active low IRQs as well as active high IRQs for wm831x

As with the existing emulation this should not be used in production
systems but is useful for test purposes.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/wm831x-irq.c b/drivers/mfd/wm831x-irq.c
index ecc9d6d..804e56e 100644
--- a/drivers/mfd/wm831x-irq.c
+++ b/drivers/mfd/wm831x-irq.c
@@ -413,22 +413,25 @@ static int wm831x_irq_set_type(struct irq_data *data, unsigned int type)
 	 * do the update here as we can be called with the bus lock
 	 * held.
 	 */
+	wm831x->gpio_level_low[irq] = false;
+	wm831x->gpio_level_high[irq] = false;
 	switch (type) {
 	case IRQ_TYPE_EDGE_BOTH:
 		wm831x->gpio_update[irq] = 0x10000 | WM831X_GPN_INT_MODE;
-		wm831x->gpio_level[irq] = false;
 		break;
 	case IRQ_TYPE_EDGE_RISING:
 		wm831x->gpio_update[irq] = 0x10000 | WM831X_GPN_POL;
-		wm831x->gpio_level[irq] = false;
 		break;
 	case IRQ_TYPE_EDGE_FALLING:
 		wm831x->gpio_update[irq] = 0x10000;
-		wm831x->gpio_level[irq] = false;
 		break;
 	case IRQ_TYPE_LEVEL_HIGH:
 		wm831x->gpio_update[irq] = 0x10000 | WM831X_GPN_POL;
-		wm831x->gpio_level[irq] = true;
+		wm831x->gpio_level_high[irq] = true;
+		break;
+	case IRQ_TYPE_LEVEL_LOW:
+		wm831x->gpio_update[irq] = 0x10000;
+		wm831x->gpio_level_low[irq] = true;
 		break;
 	default:
 		return -EINVAL;
@@ -517,7 +520,7 @@ static irqreturn_t wm831x_irq_thread(int irq, void *data)
 		 * status.  This is sucky but improves interoperability.
 		 */
 		if (primary == WM831X_GP_INT &&
-		    wm831x->gpio_level[i - WM831X_IRQ_GPIO_1]) {
+		    wm831x->gpio_level_high[i - WM831X_IRQ_GPIO_1]) {
 			ret = wm831x_reg_read(wm831x, WM831X_GPIO_LEVEL);
 			while (ret & 1 << (i - WM831X_IRQ_GPIO_1)) {
 				handle_nested_irq(irq_find_mapping(wm831x->irq_domain,
@@ -526,6 +529,17 @@ static irqreturn_t wm831x_irq_thread(int irq, void *data)
 						      WM831X_GPIO_LEVEL);
 			}
 		}
+
+		if (primary == WM831X_GP_INT &&
+		    wm831x->gpio_level_low[i - WM831X_IRQ_GPIO_1]) {
+			ret = wm831x_reg_read(wm831x, WM831X_GPIO_LEVEL);
+			while (!(ret & 1 << (i - WM831X_IRQ_GPIO_1))) {
+				handle_nested_irq(irq_find_mapping(wm831x->irq_domain,
+								   i));
+				ret = wm831x_reg_read(wm831x,
+						      WM831X_GPIO_LEVEL);
+			}
+		}
 	}
 
 out:
diff --git a/include/linux/mfd/wm831x/core.h b/include/linux/mfd/wm831x/core.h
index 736191c..4a3b83a 100644
--- a/include/linux/mfd/wm831x/core.h
+++ b/include/linux/mfd/wm831x/core.h
@@ -384,7 +384,8 @@ struct wm831x {
 
 	/* Used by the interrupt controller code to post writes */
 	int gpio_update[WM831X_NUM_GPIO_REGS];
-	bool gpio_level[WM831X_NUM_GPIO_REGS];
+	bool gpio_level_high[WM831X_NUM_GPIO_REGS];
+	bool gpio_level_low[WM831X_NUM_GPIO_REGS];
 
 	struct mutex auxadc_lock;
 	struct list_head auxadc_pending;
-- 
cgit v0.10.2


From 6abe4a87f7bc7978705c386dbba0ca0c7790b3ec Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Wed, 16 May 2012 14:22:21 +0300
Subject: exofs: Fix CRASH on very early IO errors.

If at exofs_fill_super() we had an early termination
do to any error, like an IO error while reading the
super-block. We would crash inside exofs_free_sbi().

This is because sbi->oc.numdevs was set to 1, before
we actually have a device table at all.

Fix it by moving the sbi->oc.numdevs = 1 to after the
allocation of the device table.

Reported-by: Johannes Schild <JSchild@gmx.de>

Stable: This is a bug since v3.2.0
CC: Stable Tree <stable@kernel.org>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>

diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 735ca06..59e0849 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -745,7 +745,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->one_comp.obj.partition = opts->pid;
 	sbi->one_comp.obj.id = 0;
 	exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
-	sbi->oc.numdevs = 1;
 	sbi->oc.single_comp = EC_SINGLE_COMP;
 	sbi->oc.comps = &sbi->one_comp;
 
@@ -804,6 +803,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 			goto free_sbi;
 
 		ore_comp_set_dev(&sbi->oc, 0, od);
+		sbi->oc.numdevs = 1;
 	}
 
 	__sbi_read_stats(sbi);
-- 
cgit v0.10.2


From 8b56a30caaf9bc1850784f196636c5f550cc7577 Mon Sep 17 00:00:00 2001
From: Sachin Bhamare <sbhamare@panasas.com>
Date: Wed, 14 Mar 2012 18:01:45 -0700
Subject: exofs:  Add SYSFS info for autologin/pNFS export

Introduce sysfs infrastructure for exofs cluster filesystem.

Each OSD target shows up as below in the sysfs hierarchy:
	/sys/fs/exofs/<osdname>_<partition_id>/devX

Where <osdname>_<partition_id> is the unique identification
of a Superblock.

Where devX: 0 <= X < device_table_size. They are ordered
in device-table order as specified to the mkfs.exofs command

Each OSD device  devX has following attributes :
	osdname - ReadOnly
	systemid - ReadOnly
	uri - Read/Write

It is up to user-mode to update devX/uri for support of
autologin.

These sysfs information are used both for autologin as well
as support for exporting exofs via a pNFSD server in user-mode.
(.eg NFS-Ganesha)

Signed-off-by: Sachin Bhamare <sbhamare@panasas.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 352ba14..389ba83 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -16,5 +16,5 @@
 libore-y := ore.o ore_raid.o
 obj-$(CONFIG_ORE) += libore.o
 
-exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
+exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index ca9d496..fffe86f 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -56,6 +56,9 @@
 struct exofs_dev {
 	struct ore_dev ored;
 	unsigned did;
+	unsigned urilen;
+	uint8_t *uri;
+	struct kobject ed_kobj;
 };
 /*
  * our extension to the in-memory superblock
@@ -73,6 +76,7 @@ struct exofs_sb_info {
 	struct ore_layout	layout;		/* Default files layout       */
 	struct ore_comp one_comp;		/* id & cred of partition id=0*/
 	struct ore_components oc;		/* comps for the partition    */
+	struct kobject	s_kobj;			/* holds per-sbi kobject      */
 };
 
 /*
@@ -176,6 +180,16 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
 			   const struct osd_obj_id *obj);
 int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
 
+/* sys.c                 */
+int exofs_sysfs_init(void);
+void exofs_sysfs_uninit(void);
+int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
+		       struct exofs_dt_device_info *dt_dev);
+void exofs_sysfs_sb_del(struct exofs_sb_info *sbi);
+int exofs_sysfs_odev_add(struct exofs_dev *edev,
+			 struct exofs_sb_info *sbi);
+void exofs_sysfs_dbg_print(void);
+
 /*********************
  * operation vectors *
  *********************/
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 59e0849..4337836 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -472,6 +472,7 @@ static void exofs_put_super(struct super_block *sb)
 	_exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
 			    sbi->one_comp.obj.partition);
 
+	exofs_sysfs_sb_del(sbi);
 	bdi_destroy(&sbi->bdi);
 	exofs_free_sbi(sbi);
 	sb->s_fs_info = NULL;
@@ -632,6 +633,12 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 	memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
 		(numdevs - 1) * sizeof(sbi->oc.ods[0]));
 
+	/* create sysfs subdir under which we put the device table
+	 * And cluster layout. A Superblock is identified by the string:
+	 *	"dev[0].osdname"_"pid"
+	 */
+	exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]);
+
 	for (i = 0; i < numdevs; i++) {
 		struct exofs_fscb fscb;
 		struct osd_dev_info odi;
@@ -657,6 +664,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 			eds[i].ored.od = fscb_od;
 			++sbi->oc.numdevs;
 			fscb_od = NULL;
+			exofs_sysfs_odev_add(&eds[i], sbi);
 			continue;
 		}
 
@@ -682,6 +690,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 				  odi.osdname);
 			goto out;
 		}
+		exofs_sysfs_odev_add(&eds[i], sbi);
 
 		/* TODO: verify other information is correct and FS-uuid
 		 *	 matches. Benny what did you say about device table
@@ -844,6 +853,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
+	exofs_sysfs_dbg_print();
 	_exofs_print_device("Mounting", opts->dev_name,
 			    ore_comp_dev(&sbi->oc, 0),
 			    sbi->one_comp.obj.partition);
@@ -1023,6 +1033,9 @@ static int __init init_exofs(void)
 	if (err)
 		goto out_d;
 
+	/* We don't fail if sysfs creation failed */
+	exofs_sysfs_init();
+
 	return 0;
 out_d:
 	destroy_inodecache();
@@ -1032,6 +1045,7 @@ out:
 
 static void __exit exit_exofs(void)
 {
+	exofs_sysfs_uninit();
 	unregister_filesystem(&exofs_type);
 	destroy_inodecache();
 }
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
new file mode 100644
index 0000000..e32bc91
--- /dev/null
+++ b/fs/exofs/sys.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2012
+ * Sachin Bhamare <sbhamare@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License 2 as published by
+ * the Free Software Foundation.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the:
+ *	Free Software Foundation <licensing@fsf.org>
+ */
+
+#include <linux/kobject.h>
+#include <linux/device.h>
+
+#include "exofs.h"
+
+struct odev_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct exofs_dev *, char *);
+	ssize_t (*store)(struct exofs_dev *, const char *, size_t);
+};
+
+static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr,
+		char *buf)
+{
+	struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
+	struct odev_attr *a = container_of(attr, struct odev_attr, attr);
+
+	return a->show ? a->show(edp, buf) : 0;
+}
+
+static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr,
+		const char *buf, size_t len)
+{
+	struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
+	struct odev_attr *a = container_of(attr, struct odev_attr, attr);
+
+	return a->store ? a->store(edp, buf, len) : len;
+}
+
+static const struct sysfs_ops odev_attr_ops = {
+	.show  = odev_attr_show,
+	.store = odev_attr_store,
+};
+
+
+static struct kset *exofs_kset;
+
+static ssize_t osdname_show(struct exofs_dev *edp, char *buf)
+{
+	struct osd_dev *odev = edp->ored.od;
+	const struct osd_dev_info *odi = osduld_device_info(odev);
+
+	return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname);
+}
+
+static ssize_t systemid_show(struct exofs_dev *edp, char *buf)
+{
+	struct osd_dev *odev = edp->ored.od;
+	const struct osd_dev_info *odi = osduld_device_info(odev);
+
+	memcpy(buf, odi->systemid, odi->systemid_len);
+	return odi->systemid_len;
+}
+
+static ssize_t uri_show(struct exofs_dev *edp, char *buf)
+{
+	return snprintf(buf, edp->urilen, "%s", edp->uri);
+}
+
+static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len)
+{
+	edp->urilen = strlen(buf) + 1;
+	edp->uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL);
+	strncpy(edp->uri, buf, edp->urilen);
+	return edp->urilen;
+}
+
+#define OSD_ATTR(name, mode, show, store) \
+	static struct odev_attr odev_attr_##name = \
+					__ATTR(name, mode, show, store)
+
+OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL);
+OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL);
+OSD_ATTR(uri, S_IRWXU, uri_show, uri_store);
+
+static struct attribute *odev_attrs[] = {
+	&odev_attr_osdname.attr,
+	&odev_attr_systemid.attr,
+	&odev_attr_uri.attr,
+	NULL,
+};
+
+static struct kobj_type odev_ktype = {
+	.default_attrs	= odev_attrs,
+	.sysfs_ops	= &odev_attr_ops,
+};
+
+static struct kobj_type uuid_ktype = {
+};
+
+void exofs_sysfs_dbg_print()
+{
+#ifdef CONFIG_EXOFS_DEBUG
+	struct kobject *k_name, *k_tmp;
+
+	list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
+		printk(KERN_INFO "%s: name %s ref %d\n",
+			__func__, kobject_name(k_name),
+			(int)atomic_read(&k_name->kref.refcount));
+	}
+#endif
+}
+/*
+ * This function removes all kobjects under exofs_kset
+ * At the end of it, exofs_kset kobject will have a refcount
+ * of 1 which gets decremented only on exofs module unload
+ */
+void exofs_sysfs_sb_del(struct exofs_sb_info *sbi)
+{
+	struct kobject *k_name, *k_tmp;
+	struct kobject *s_kobj = &sbi->s_kobj;
+
+	list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
+		/* Remove all that are children of this SBI */
+		if (k_name->parent == s_kobj)
+			kobject_put(k_name);
+	}
+	kobject_put(s_kobj);
+}
+
+/*
+ * This function creates sysfs entries to hold the current exofs cluster
+ * instance (uniquely identified by osdname,pid tuple).
+ * This function gets called once per exofs mount instance.
+ */
+int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
+		       struct exofs_dt_device_info *dt_dev)
+{
+	struct kobject *s_kobj;
+	int retval = 0;
+	uint64_t pid = sbi->one_comp.obj.partition;
+
+	/* allocate new uuid dirent */
+	s_kobj = &sbi->s_kobj;
+	s_kobj->kset = exofs_kset;
+	retval = kobject_init_and_add(s_kobj, &uuid_ktype,
+			&exofs_kset->kobj,  "%s_%llx", dt_dev->osdname, pid);
+	if (retval) {
+		EXOFS_ERR("ERROR: Failed to create sysfs entry for "
+			  "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi)
+{
+	struct kobject *d_kobj;
+	int retval = 0;
+
+	/* create osd device group which contains following attributes
+	 * osdname, systemid & uri
+	 */
+	d_kobj = &edev->ed_kobj;
+	d_kobj->kset = exofs_kset;
+	retval = kobject_init_and_add(d_kobj, &odev_ktype,
+			&sbi->s_kobj, "dev%u", edev->did);
+	if (retval) {
+		EXOFS_ERR("ERROR: Failed to create sysfs entry for "
+				"device dev%u\n", edev->did);
+		return retval;
+	}
+	return 0;
+}
+
+int exofs_sysfs_init(void)
+{
+	exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj);
+	if (!exofs_kset) {
+		EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void exofs_sysfs_uninit(void)
+{
+	kset_unregister(exofs_kset);
+}
-- 
cgit v0.10.2


From 53b8ee346463946f88b3e1639d688c384df1166c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 22 May 2012 16:36:27 -0400
Subject: NFSv4.1: Fix a bad reference count issue in the pNFS commit code

filelayout_scan_commit_lists needs to bump the reference count on
the struct nfs_page just like nfs_scan_commit_list().

Reported-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 474c630..33849d3 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -1106,6 +1106,7 @@ transfer_commit_list(struct list_head *src, struct list_head *dst,
 	list_for_each_entry_safe(req, tmp, src, wb_list) {
 		if (!nfs_lock_request(req))
 			continue;
+		kref_get(&req->wb_kref);
 		if (cond_resched_lock(cinfo->lock))
 			list_safe_reset_next(req, tmp, wb_list);
 		nfs_request_remove_commit_list(req, cinfo);
-- 
cgit v0.10.2


From d42e78737c31f08893ed4916fc03104790867a71 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 22 May 2012 08:09:28 -0400
Subject: NFSv4.1 fix null state reference in filelayout_async_handle_error

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 33849d3..f8edb14 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -199,8 +199,8 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 		 * layout is destroyed and a new valid layout is obtained.
 		 */
 		set_bit(NFS_LAYOUT_INVALID,
-				&NFS_I(state->inode)->layout->plh_flags);
-		pnfs_destroy_layout(NFS_I(state->inode));
+				&NFS_I(inode)->layout->plh_flags);
+		pnfs_destroy_layout(NFS_I(inode));
 		rpc_wake_up(&tbl->slot_tbl_waitq);
 		goto reset;
 	/* RPC connection errors */
@@ -214,7 +214,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 		dprintk("%s DS connection error %d\n", __func__,
 			task->tk_status);
 		if (!filelayout_test_devid_invalid(devid))
-			_pnfs_return_layout(state->inode);
+			_pnfs_return_layout(inode);
 		filelayout_mark_devid_invalid(devid);
 		rpc_wake_up(&tbl->slot_tbl_waitq);
 		nfs4_ds_disconnect(clp);
-- 
cgit v0.10.2


From 996074cb8c355bf3d87d066ba1e1189ba3f648f5 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 22 May 2012 08:09:26 -0400
Subject: NFSv4.1 Just use nfs_put_client in filelayout release

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index f8edb14..4662847 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -101,9 +101,6 @@ static void filelayout_reset_write(struct nfs_write_data *data)
 							&hdr->pages,
 							hdr->completion_ops);
 	}
-	/* balance nfs_get_client in filelayout_write_pagelist */
-	nfs_put_client(data->ds_clp);
-	data->ds_clp     = NULL;
 }
 
 static void filelayout_reset_read(struct nfs_read_data *data)
@@ -125,9 +122,6 @@ static void filelayout_reset_read(struct nfs_read_data *data)
 							&hdr->pages,
 							hdr->completion_ops);
 	}
-	/* balance nfs_get_client in filelayout_read_pagelist */
-	nfs_put_client(data->ds_clp);
-	data->ds_clp = NULL;
 }
 
 static int filelayout_async_handle_error(struct rpc_task *task,
@@ -326,8 +320,7 @@ static void filelayout_read_release(void *data)
 {
 	struct nfs_read_data *rdata = data;
 
-	if (!test_bit(NFS_IOHDR_REDO, &rdata->header->flags))
-		nfs_put_client(rdata->ds_clp);
+	nfs_put_client(rdata->ds_clp);
 	rdata->header->mds_ops->rpc_release(data);
 }
 
@@ -424,8 +417,7 @@ static void filelayout_write_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
 
-	if (!test_bit(NFS_IOHDR_REDO, &wdata->header->flags))
-		nfs_put_client(wdata->ds_clp);
+	nfs_put_client(wdata->ds_clp);
 	wdata->header->mds_ops->rpc_release(data);
 }
 
-- 
cgit v0.10.2


From bd4aeffb5b89070ae93c579f1d5a0758f7123e8b Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 22 May 2012 08:09:27 -0400
Subject: NFSv4.1 skip rpc_call_done only on disconnected DS slot_table_waitq
 tasks

We reset all I/O on a disconnected data server through the pgio layer indicated
by the NFS_IOHDR_REDO flag.

Differentiate between on-the-wire tasks returning with an error which must
call rpc_call_done and tasks woken from the data server slot_table_waitq
waiting for a session slot with a status of zero which call rpc_exit in
rpc_prepare and need to skip rpc_call_done.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 4662847..56aa0ec 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -302,7 +302,8 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
 
 	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
-	if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags))
+	if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) &&
+	    task->tk_status == 0)
 		return;
 
 	/* Note this may cause RPC to be resent */
@@ -399,7 +400,8 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data)
 {
 	struct nfs_write_data *wdata = data;
 
-	if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags))
+	if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
+	    task->tk_status == 0)
 		return;
 
 	/* Note this may cause RPC to be resent */
-- 
cgit v0.10.2


From 497826af60f812240ed5b6ba80541f7c9f2154d9 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Tue, 22 May 2012 10:10:03 -0400
Subject: NFS: Fix compiler warnings

The "struct inode *inode" was only used in a dprintk, so compiling with
CONFIG_SUNRPC_DEBUG off triggers a warning.  To get around this, I
remove the "struct inode *inode" variable and instead change the
dprintk()s to use hdr->inode instead.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 56aa0ec..ddea4d3 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -85,15 +85,14 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
 static void filelayout_reset_write(struct nfs_write_data *data)
 {
 	struct nfs_pgio_header *hdr = data->header;
-	struct inode *inode = hdr->inode;
 	struct rpc_task *task = &data->task;
 
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 		dprintk("%s Reset task %5u for i/o through MDS "
 			"(req %s/%lld, %u bytes @ offset %llu)\n", __func__,
 			data->task.tk_pid,
-			inode->i_sb->s_id,
-			(long long)NFS_FILEID(inode),
+			hdr->inode->i_sb->s_id,
+			(long long)NFS_FILEID(hdr->inode),
 			data->args.count,
 			(unsigned long long)data->args.offset);
 
@@ -106,15 +105,14 @@ static void filelayout_reset_write(struct nfs_write_data *data)
 static void filelayout_reset_read(struct nfs_read_data *data)
 {
 	struct nfs_pgio_header *hdr = data->header;
-	struct inode *inode = hdr->inode;
 	struct rpc_task *task = &data->task;
 
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 		dprintk("%s Reset task %5u for i/o through MDS "
 			"(req %s/%lld, %u bytes @ offset %llu)\n", __func__,
 			data->task.tk_pid,
-			inode->i_sb->s_id,
-			(long long)NFS_FILEID(inode),
+			hdr->inode->i_sb->s_id,
+			(long long)NFS_FILEID(hdr->inode),
 			data->args.count,
 			(unsigned long long)data->args.offset);
 
-- 
cgit v0.10.2


From c3607282b4d8787a530eb4a9a452b4e823508b9f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:45:16 -0400
Subject: NFS: Don't swap bytes in nfs4_construct_boot_verifier()

The SETCLIENTID boot verifier is opaque to NFSv4 servers, thus there
is no requirement for byte swapping before the client puts the
verifier on the wire.

This treatment is similar to other timestamp-based verifiers.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 78784e5..0f4e540 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3908,8 +3908,8 @@ static void nfs4_construct_boot_verifier(struct nfs_client *clp,
 {
 	__be32 verf[2];
 
-	verf[0] = htonl((u32)clp->cl_boot_time.tv_sec);
-	verf[1] = htonl((u32)clp->cl_boot_time.tv_nsec);
+	verf[0] = (__be32)clp->cl_boot_time.tv_sec;
+	verf[1] = (__be32)clp->cl_boot_time.tv_nsec;
 	memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
 
-- 
cgit v0.10.2


From e3c0fb7ef515852619932b0da993baa2d107684d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:45:24 -0400
Subject: NFS: Add NFSDBG_STATE

fs/nfs/nfs4state.c does not yet have any dprintk() call sites, and I'm
about to introduce some.  We will need a new flag for enabling them.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index dc484c0..6930bec 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -49,7 +49,7 @@
 #include "nfs4_fs.h"
 #include "delegation.h"
 
-#define NFSDBG_FACILITY	NFSDBG_PROC
+#define NFSDBG_FACILITY		NFSDBG_STATE
 
 void
 nfs4_renew_state(struct work_struct *work)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 7f0fcfc..f8c06de 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -57,6 +57,8 @@
 #include "internal.h"
 #include "pnfs.h"
 
+#define NFSDBG_FACILITY		NFSDBG_STATE
+
 #define OPENOWNER_POOL_SIZE	8
 
 const nfs4_stateid zero_stateid;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 6cc7dba..80a9385 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -634,6 +634,7 @@ nfs_fileid_to_ino_t(u64 fileid)
 #define NFSDBG_FSCACHE		0x0800
 #define NFSDBG_PNFS		0x1000
 #define NFSDBG_PNFS_LD		0x2000
+#define NFSDBG_STATE		0x4000
 #define NFSDBG_ALL		0xFFFF
 
 #ifdef __KERNEL__
-- 
cgit v0.10.2


From 722baafc9e638714a69aa66e9ed24ef961ff350c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:44:22 -0400
Subject: NFS: Fix comment misspelling in struct nfs_client definition

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 7073fc7..5498e9d 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -79,7 +79,7 @@ struct nfs_client {
 	u32			cl_seqid;
 	/* The flags used for obtaining the clientid during EXCHANGE_ID */
 	u32			cl_exchange_flags;
-	struct nfs4_session	*cl_session; 	/* sharred session */
+	struct nfs4_session	*cl_session;	/* shared session */
 #endif /* CONFIG_NFS_V4 */
 
 #ifdef CONFIG_NFS_FSCACHE
-- 
cgit v0.10.2


From 79d4e1f0d8910f0214a57832ca6d589640d572c0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:44:31 -0400
Subject: NFS: Use proper naming conventions for NFSv4.1 server scope fields

Clean up:  When naming fields and data types, follow established
conventions to facilitate accurate grep/cscope searches.

Additionally, for consistency, move the scope field into the NFSv4-
specific part of the nfs_client, and free that memory in the logic
that shuts down NFSv4 nfs_clients.

Introduced by commit 99fe60d0 "nfs41: exchange_id operation", April
1 2009.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index b4e2199..471fc9b 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -237,6 +237,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
 		nfs_idmap_delete(clp);
 
 	rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
+	kfree(clp->cl_serverscope);
 }
 
 /* idr_remove_all is not needed as all id's are removed by nfs_put_client */
@@ -305,7 +306,6 @@ static void nfs_free_client(struct nfs_client *clp)
 
 	put_net(clp->net);
 	kfree(clp->cl_hostname);
-	kfree(clp->server_scope);
 	kfree(clp->impl_id);
 	kfree(clp);
 
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index edeef71..b14bcc3 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -338,7 +338,7 @@ extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_recall_slot(struct nfs_client *clp);
 extern void nfs41_handle_server_scope(struct nfs_client *,
-				      struct server_scope **);
+				      struct nfs41_server_scope **);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0f4e540..94494f2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5052,7 +5052,8 @@ out_inval:
 }
 
 static bool
-nfs41_same_server_scope(struct server_scope *a, struct server_scope *b)
+nfs41_same_server_scope(struct nfs41_server_scope *a,
+			struct nfs41_server_scope *b)
 {
 	if (a->server_scope_sz == b->server_scope_sz &&
 	    memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0)
@@ -5099,7 +5100,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 				clp->cl_rpcclient->cl_nodename,
 				clp->cl_rpcclient->cl_auth->au_flavor);
 
-	res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL);
+	res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
+					GFP_KERNEL);
 	if (unlikely(!res.server_scope)) {
 		status = -ENOMEM;
 		goto out;
@@ -5123,18 +5125,18 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 		kfree(res.impl_id);
 
 	if (!status) {
-		if (clp->server_scope &&
-		    !nfs41_same_server_scope(clp->server_scope,
+		if (clp->cl_serverscope &&
+		    !nfs41_same_server_scope(clp->cl_serverscope,
 					     res.server_scope)) {
 			dprintk("%s: server_scope mismatch detected\n",
 				__func__);
 			set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
-			kfree(clp->server_scope);
-			clp->server_scope = NULL;
+			kfree(clp->cl_serverscope);
+			clp->cl_serverscope = NULL;
 		}
 
-		if (!clp->server_scope) {
-			clp->server_scope = res.server_scope;
+		if (!clp->cl_serverscope) {
+			clp->cl_serverscope = res.server_scope;
 			goto out;
 		}
 	}
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 5498e9d..900d733 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -17,7 +17,7 @@ struct nfs4_sequence_args;
 struct nfs4_sequence_res;
 struct nfs_server;
 struct nfs4_minor_version_ops;
-struct server_scope;
+struct nfs41_server_scope;
 struct nfs41_impl_id;
 
 /*
@@ -80,13 +80,13 @@ struct nfs_client {
 	/* The flags used for obtaining the clientid during EXCHANGE_ID */
 	u32			cl_exchange_flags;
 	struct nfs4_session	*cl_session;	/* shared session */
+	struct nfs41_server_scope *cl_serverscope;
 #endif /* CONFIG_NFS_V4 */
 
 #ifdef CONFIG_NFS_FSCACHE
 	struct fscache_cookie	*fscache;	/* client index cache cookie */
 #endif
 
-	struct server_scope	*server_scope;	/* from exchange_id */
 	struct nfs41_impl_id	*impl_id;	/* from exchange_id */
 	struct net		*net;
 };
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 2e53a3f..c420b8d 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1104,7 +1104,7 @@ struct server_owner {
 	char				major_id[NFS4_OPAQUE_LIMIT];
 };
 
-struct server_scope {
+struct nfs41_server_scope {
 	uint32_t			server_scope_sz;
 	char 				server_scope[NFS4_OPAQUE_LIMIT];
 };
@@ -1118,7 +1118,7 @@ struct nfs41_impl_id {
 struct nfs41_exchange_id_res {
 	struct nfs_client		*client;
 	u32				flags;
-	struct server_scope		*server_scope;
+	struct nfs41_server_scope	*server_scope;
 	struct nfs41_impl_id		*impl_id;
 };
 
-- 
cgit v0.10.2


From 591555465ec513c42416392d392fd56866cb220c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:44:41 -0400
Subject: NFS: Use proper naming conventions for nfs_client.impl_id field

Clean up:  When naming fields and data types, follow established
conventions to facilitate accurate grep/cscope searches.

Additionally, for consistency, move the impl_id field into the NFSv4-
specific part of the nfs_client, and free that memory in the logic
that shuts down NFSv4 nfs_clients.

Introduced by commit 7d2ed9ac "NFSv4: parse and display server
implementation ids," Fri Feb 17, 2012.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 471fc9b..39db1be 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -238,6 +238,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
 
 	rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
 	kfree(clp->cl_serverscope);
+	kfree(clp->cl_implid);
 }
 
 /* idr_remove_all is not needed as all id's are removed by nfs_put_client */
@@ -306,7 +307,6 @@ static void nfs_free_client(struct nfs_client *clp)
 
 	put_net(clp->net);
 	kfree(clp->cl_hostname);
-	kfree(clp->impl_id);
 	kfree(clp);
 
 	dprintk("<-- nfs_free_client()\n");
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 94494f2..daa4e1b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5119,8 +5119,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 
 	if (!status) {
 		/* use the most recent implementation id */
-		kfree(clp->impl_id);
-		clp->impl_id = res.impl_id;
+		kfree(clp->cl_implid);
+		clp->cl_implid = res.impl_id;
 	} else
 		kfree(res.impl_id);
 
@@ -5144,12 +5144,12 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 out_server_scope:
 	kfree(res.server_scope);
 out:
-	if (clp->impl_id)
+	if (clp->cl_implid)
 		dprintk("%s: Server Implementation ID: "
 			"domain: %s, name: %s, date: %llu,%u\n",
-			__func__, clp->impl_id->domain, clp->impl_id->name,
-			clp->impl_id->date.seconds,
-			clp->impl_id->date.nseconds);
+			__func__, clp->cl_implid->domain, clp->cl_implid->name,
+			clp->cl_implid->date.seconds,
+			clp->cl_implid->date.nseconds);
 	dprintk("<-- %s status= %d\n", __func__, status);
 	return status;
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a973eb1..ff656c0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -796,8 +796,8 @@ static void show_pnfs(struct seq_file *m, struct nfs_server *server)
 
 static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
 {
-	if (nfss->nfs_client && nfss->nfs_client->impl_id) {
-		struct nfs41_impl_id *impl_id = nfss->nfs_client->impl_id;
+	if (nfss->nfs_client && nfss->nfs_client->cl_implid) {
+		struct nfs41_impl_id *impl_id = nfss->nfs_client->cl_implid;
 		seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s',"
 			   "date='%llu,%u'",
 			   impl_id->name, impl_id->domain,
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 900d733..773e021 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -81,13 +81,13 @@ struct nfs_client {
 	u32			cl_exchange_flags;
 	struct nfs4_session	*cl_session;	/* shared session */
 	struct nfs41_server_scope *cl_serverscope;
+	struct nfs41_impl_id	*cl_implid;
 #endif /* CONFIG_NFS_V4 */
 
 #ifdef CONFIG_NFS_FSCACHE
 	struct fscache_cookie	*fscache;	/* client index cache cookie */
 #endif
 
-	struct nfs41_impl_id	*impl_id;	/* from exchange_id */
 	struct net		*net;
 };
 
-- 
cgit v0.10.2


From 73ea666c2bb536f2862cefdb3e014ed62b262ba5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:44:50 -0400
Subject: NFS: Use proper naming conventions for the nfs_client.net field

Clean up:  When naming fields and data types, follow established
conventions to facilitate accurate grep/cscope searches.

Introduced by commit e50a7a1a "NFS: make NFS client allocated per
network namespace context," Tue Jan 10, 2012.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index a5c88a5..c965542 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -123,7 +123,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
 	uint8_t *dataptr;
 	DECLARE_WAITQUEUE(wq, current);
 	int offset, len, i, rc;
-	struct net *net = server->nfs_client->net;
+	struct net *net = server->nfs_client->cl_net;
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
 	struct bl_dev_msg *reply = &nn->bl_mount_reply;
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 39db1be..9b9df71 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -65,7 +65,7 @@ static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
 static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
 {
 	int ret = 0;
-	struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
 
 	if (clp->rpc_ops->version != 4 || minorversion != 0)
 		return ret;
@@ -174,7 +174,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	clp->cl_rpcclient = ERR_PTR(-EINVAL);
 
 	clp->cl_proto = cl_init->proto;
-	clp->net = get_net(cl_init->net);
+	clp->cl_net = get_net(cl_init->net);
 
 #ifdef CONFIG_NFS_V4
 	err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
@@ -252,7 +252,7 @@ void nfs_cleanup_cb_ident_idr(struct net *net)
 /* nfs_client_lock held */
 static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
 {
-	struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
 
 	if (clp->cl_cb_ident)
 		idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident);
@@ -305,7 +305,7 @@ static void nfs_free_client(struct nfs_client *clp)
 	if (clp->cl_machine_cred != NULL)
 		put_rpccred(clp->cl_machine_cred);
 
-	put_net(clp->net);
+	put_net(clp->cl_net);
 	kfree(clp->cl_hostname);
 	kfree(clp);
 
@@ -323,7 +323,7 @@ void nfs_put_client(struct nfs_client *clp)
 		return;
 
 	dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count));
-	nn = net_generic(clp->net, nfs_net_id);
+	nn = net_generic(clp->cl_net, nfs_net_id);
 
 	if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {
 		list_del(&clp->cl_share_link);
@@ -661,7 +661,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
 {
 	struct rpc_clnt		*clnt = NULL;
 	struct rpc_create_args args = {
-		.net		= clp->net,
+		.net		= clp->cl_net,
 		.protocol	= clp->cl_proto,
 		.address	= (struct sockaddr *)&clp->cl_addr,
 		.addrsize	= clp->cl_addrlen,
@@ -715,7 +715,7 @@ static int nfs_start_lockd(struct nfs_server *server)
 		.nfs_version	= clp->rpc_ops->version,
 		.noresvport	= server->flags & NFS_MOUNT_NORESVPORT ?
 					1 : 0,
-		.net		= clp->net,
+		.net		= clp->cl_net,
 	};
 
 	if (nlm_init.nfs_version > 3)
@@ -1060,7 +1060,7 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
 static void nfs_server_insert_lists(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
-	struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
 
 	spin_lock(&nn->nfs_client_lock);
 	list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
@@ -1077,7 +1077,7 @@ static void nfs_server_remove_lists(struct nfs_server *server)
 
 	if (clp == NULL)
 		return;
-	nn = net_generic(clp->net, nfs_net_id);
+	nn = net_generic(clp->cl_net, nfs_net_id);
 	spin_lock(&nn->nfs_client_lock);
 	list_del_rcu(&server->client_link);
 	if (list_empty(&clp->cl_superblocks))
@@ -1486,7 +1486,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 		.rpc_ops = &nfs_v4_clientops,
 		.proto = ds_proto,
 		.minorversion = mds_clp->cl_minorversion,
-		.net = mds_clp->net,
+		.net = mds_clp->cl_net,
 	};
 	struct rpc_timeout ds_timeout;
 	struct nfs_client *clp;
@@ -1709,7 +1709,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 				rpc_protocol(parent_server->client),
 				parent_server->client->cl_timeout,
 				parent_client->cl_mvops->minor_version,
-				parent_client->net);
+				parent_client->cl_net);
 	if (error < 0)
 		goto error;
 
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 3e8edbe..2eaecf9 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -415,7 +415,7 @@ static int __nfs_idmap_register(struct dentry *dir,
 static void nfs_idmap_unregister(struct nfs_client *clp,
 				      struct rpc_pipe *pipe)
 {
-	struct net *net = clp->net;
+	struct net *net = clp->cl_net;
 	struct super_block *pipefs_sb;
 
 	pipefs_sb = rpc_get_sb_net(net);
@@ -429,7 +429,7 @@ static int nfs_idmap_register(struct nfs_client *clp,
 				   struct idmap *idmap,
 				   struct rpc_pipe *pipe)
 {
-	struct net *net = clp->net;
+	struct net *net = clp->cl_net;
 	struct super_block *pipefs_sb;
 	int err = 0;
 
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index bf49b78..c610f84 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -629,7 +629,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 
 		mp_count = be32_to_cpup(p); /* multipath count */
 		for (j = 0; j < mp_count; j++) {
-			da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->net,
+			da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
 					    &stream, gfp_flags);
 			if (da)
 				list_add_tail(&da->da_node, &dsaddrs);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 773e021..59410b3 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -88,7 +88,7 @@ struct nfs_client {
 	struct fscache_cookie	*fscache;	/* client index cache cookie */
 #endif
 
-	struct net		*net;
+	struct net		*cl_net;
 };
 
 /*
-- 
cgit v0.10.2


From 177313f1498dd66b551dccadc98331b3fc3b09a4 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:44:58 -0400
Subject: NFS: Clean up return code checking in nfs4_proc_exchange_id()

Clean up: update to use matching types in "if" expressions.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index daa4e1b..ab6b2e5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5102,30 +5102,30 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 
 	res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
 					GFP_KERNEL);
-	if (unlikely(!res.server_scope)) {
+	if (unlikely(res.server_scope == NULL)) {
 		status = -ENOMEM;
 		goto out;
 	}
 
 	res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_KERNEL);
-	if (unlikely(!res.impl_id)) {
+	if (unlikely(res.impl_id == NULL)) {
 		status = -ENOMEM;
 		goto out_server_scope;
 	}
 
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
-	if (!status)
+	if (status == 0)
 		status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
 
-	if (!status) {
+	if (status == 0) {
 		/* use the most recent implementation id */
 		kfree(clp->cl_implid);
 		clp->cl_implid = res.impl_id;
 	} else
 		kfree(res.impl_id);
 
-	if (!status) {
-		if (clp->cl_serverscope &&
+	if (status == 0) {
+		if (clp->cl_serverscope != NULL &&
 		    !nfs41_same_server_scope(clp->cl_serverscope,
 					     res.server_scope)) {
 			dprintk("%s: server_scope mismatch detected\n",
@@ -5135,7 +5135,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 			clp->cl_serverscope = NULL;
 		}
 
-		if (!clp->cl_serverscope) {
+		if (clp->cl_serverscope == NULL) {
 			clp->cl_serverscope = res.server_scope;
 			goto out;
 		}
@@ -5144,7 +5144,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 out_server_scope:
 	kfree(res.server_scope);
 out:
-	if (clp->cl_implid)
+	if (clp->cl_implid != NULL)
 		dprintk("%s: Server Implementation ID: "
 			"domain: %s, name: %s, date: %llu,%u\n",
 			__func__, clp->cl_implid->domain, clp->cl_implid->name,
-- 
cgit v0.10.2


From ce1c8fc12d99386737953dfeb7b531dfa3d18e5e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:45:08 -0400
Subject: NFS: Remove nfs_unique_id

Clean up:  this structure is unused.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index b14bcc3..1526fdf 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -52,11 +52,6 @@ struct nfs4_minor_version_ops {
 	const struct nfs4_state_maintenance_ops *state_renewal_ops;
 };
 
-struct nfs_unique_id {
-	struct rb_node rb_node;
-	__u64 id;
-};
-
 #define NFS_SEQID_CONFIRMED 1
 struct nfs_seqid_counter {
 	ktime_t create_time;
-- 
cgit v0.10.2


From 2c820d9a97f07b273b2c8a5960bd52b1b5864c68 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:45:33 -0400
Subject: NFS: Force server to drop NFSv4 state

nfs4_reset_all_state() refreshes the boot verifier a server sees to
trigger that server to wipe this client's state.  This function is
invoked when an NFSv4.1 server reports that it has revoked some or
all of a client's NFSv4 state.

To facilitate server trunking discovery, we will eventually want to
move the cl_boot_time field to a more global structure.  The Uniform
Client String model (and specifically, server trunking detection)
requires that all servers see the same boot verifier until the client
actually does reboot, and not a fresh verifier every time the client
unmounts and remounts the server.

Without the cl_boot_time field, however, nfs4_reset_all_state() will
have to find some other way to force the server to purge the client's
NFSv4 state.

Because these verifiers are opaque (ie, the server doesn't know or
care that they happen to be timestamps), we can force the server
to wipe NFSv4 state by updating the boot verifier as we do now, then
immediately afterwards establish a fresh client ID using the old boot
verifier again.

Hopefully there are no extra paranoid server implementations that keep
track of the client's boot verifiers and prevent clients from reusing
a previous one.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1526fdf..e6da021 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -24,6 +24,7 @@ enum nfs4_client_state {
 	NFS4CLNT_RECALL_SLOT,
 	NFS4CLNT_LEASE_CONFIRM,
 	NFS4CLNT_SERVER_SCOPE_MISMATCH,
+	NFS4CLNT_PURGE_STATE,
 };
 
 enum nfs4_session_state {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ab6b2e5..81ccdbb 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3908,8 +3908,15 @@ static void nfs4_construct_boot_verifier(struct nfs_client *clp,
 {
 	__be32 verf[2];
 
-	verf[0] = (__be32)clp->cl_boot_time.tv_sec;
-	verf[1] = (__be32)clp->cl_boot_time.tv_nsec;
+	if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
+		/* An impossible timestamp guarantees this value
+		 * will never match a generated boot time. */
+		verf[0] = 0;
+		verf[1] = (__be32)(NSEC_PER_SEC + 1);
+	} else {
+		verf[0] = (__be32)clp->cl_boot_time.tv_sec;
+		verf[1] = (__be32)clp->cl_boot_time.tv_nsec;
+	}
 	memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f8c06de..32cce4a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1615,7 +1615,7 @@ void nfs41_handle_recall_slot(struct nfs_client *clp)
 static void nfs4_reset_all_state(struct nfs_client *clp)
 {
 	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
-		clp->cl_boot_time = CURRENT_TIME;
+		set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
 		nfs4_state_start_reclaim_nograce(clp);
 		nfs4_schedule_state_manager(clp);
 	}
@@ -1631,7 +1631,6 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
 
 static void nfs41_handle_state_revoked(struct nfs_client *clp)
 {
-	/* Temporary */
 	nfs4_reset_all_state(clp);
 }
 
@@ -1652,6 +1651,10 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 {
 	if (!flags)
 		return;
+
+	dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n",
+		__func__, clp->cl_hostname, clp->cl_clientid, flags);
+
 	if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
 		nfs41_handle_server_reboot(clp);
 	if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
@@ -1762,6 +1765,12 @@ static void nfs4_state_manager(struct nfs_client *clp)
 
 	/* Ensure exclusive access to NFSv4 state */
 	do {
+		if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
+			nfs4_reclaim_lease(clp);
+			clear_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+		}
+
 		if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
 			/* We're going to have to re-establish a clientid */
 			status = nfs4_reclaim_lease(clp);
-- 
cgit v0.10.2


From f092075dd33ea04000590e8ffea65c2e7d03d764 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:45:41 -0400
Subject: NFS: Always use the same SETCLIENTID boot verifier

Currently our NFS client assigns a unique SETCLIENTID boot verifier
for each server IP address it knows about.  It's set to CURRENT_TIME
when the struct nfs_client for that server IP is created.

During the SETCLIENTID operation, our client also presents an
nfs_client_id4 string to servers, as an identifier on which the server
can hang all of this client's NFSv4 state.  Our client's
nfs_client_id4 string is unique for each server IP address.

An NFSv4 server is obligated to wipe all NFSv4 state associated with
an nfs_client_id4 string when the client presents the same
nfs_client_id4 string along with a changed SETCLIENTID boot verifier.

When our client unmounts the last of a server's shares, it destroys
that server's struct nfs_client.  The next time the client mounts that
NFS server, it creates a fresh struct nfs_client with a fresh boot
verifier.  On seeing the fresh verifer, the server wipes any previous
NFSv4 state associated with that nfs_client_id4.

However, NFSv4.1 clients are supposed to present the same
nfs_client_id4 string to all servers.  And, to support Transparent
State Migration, the same nfs_client_id4 string should be presented
to all NFSv4.0 servers so they recognize that migrated state for this
client belongs with state a server may already have for this client.
(This is known as the Uniform Client String model).

If the nfs_client_id4 string is the same but the boot verifier changes
for each server IP address, SETCLIENTID and EXCHANGE_ID operations
from such a client could unintentionally result in a server wiping a
client's previously obtained lease.

Thus, if our NFS client is going to use a fixed nfs_client_id4 string,
either for NFSv4.0 or NFSv4.1 mounts, our NFS client should use a
boot verifier that does not change depending on server IP address.
Replace our current per-nfs_client boot verifier with a per-nfs_net
boot verifier.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 9b9df71..af9b7e4 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -184,7 +184,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	spin_lock_init(&clp->cl_lock);
 	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
 	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
-	clp->cl_boot_time = CURRENT_TIME;
 	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
 	clp->cl_minorversion = cl_init->minorversion;
 	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
@@ -1813,6 +1812,7 @@ void nfs_clients_init(struct net *net)
 	idr_init(&nn->cb_ident_idr);
 #endif
 	spin_lock_init(&nn->nfs_client_lock);
+	nn->boot_time = CURRENT_TIME;
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index aa14ec3..8a6394e 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -1,3 +1,7 @@
+/*
+ * NFS-private data for each "struct net".  Accessed with net_generic().
+ */
+
 #ifndef __NFS_NETNS_H__
 #define __NFS_NETNS_H__
 
@@ -20,6 +24,7 @@ struct nfs_net {
 	struct idr cb_ident_idr; /* Protected by nfs_client_lock */
 #endif
 	spinlock_t nfs_client_lock;
+	struct timespec boot_time;
 };
 
 extern int nfs_net_id;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 81ccdbb..9e9334a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -64,6 +64,7 @@
 #include "iostat.h"
 #include "callback.h"
 #include "pnfs.h"
+#include "netns.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
@@ -3903,8 +3904,8 @@ wait_on_recovery:
 	return -EAGAIN;
 }
 
-static void nfs4_construct_boot_verifier(struct nfs_client *clp,
-					 nfs4_verifier *bootverf)
+static void nfs4_init_boot_verifier(const struct nfs_client *clp,
+				    nfs4_verifier *bootverf)
 {
 	__be32 verf[2];
 
@@ -3914,8 +3915,9 @@ static void nfs4_construct_boot_verifier(struct nfs_client *clp,
 		verf[0] = 0;
 		verf[1] = (__be32)(NSEC_PER_SEC + 1);
 	} else {
-		verf[0] = (__be32)clp->cl_boot_time.tv_sec;
-		verf[1] = (__be32)clp->cl_boot_time.tv_nsec;
+		struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+		verf[0] = (__be32)nn->boot_time.tv_sec;
+		verf[1] = (__be32)nn->boot_time.tv_nsec;
 	}
 	memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
@@ -3939,7 +3941,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 	int loop = 0;
 	int status;
 
-	nfs4_construct_boot_verifier(clp, &sc_verifier);
+	nfs4_init_boot_verifier(clp, &sc_verifier);
 
 	for(;;) {
 		rcu_read_lock();
@@ -5099,7 +5101,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	dprintk("--> %s\n", __func__);
 	BUG_ON(clp == NULL);
 
-	nfs4_construct_boot_verifier(clp, &verifier);
+	nfs4_init_boot_verifier(clp, &verifier);
 
 	args.id_len = scnprintf(args.id, sizeof(args.id),
 				"%s/%s/%u",
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index db040e9..12b9982 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -53,9 +53,11 @@
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_idmap.h>
+
 #include "nfs4_fs.h"
 #include "internal.h"
 #include "pnfs.h"
+#include "netns.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
 
@@ -1702,6 +1704,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
 	uint32_t len;
 	struct nfs_client *clp = args->client;
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
 	u32 max_resp_sz_cached;
 
 	/*
@@ -1743,7 +1746,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(RPC_AUTH_UNIX);			/* auth_sys */
 
 	/* authsys_parms rfc1831 */
-	*p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
+	*p++ = (__be32)nn->boot_time.tv_nsec;		/* stamp */
 	p = xdr_encode_opaque(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 59410b3..fbec57d 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -61,9 +61,6 @@ struct nfs_client {
 
 	struct rpc_wait_queue	cl_rpcwaitq;
 
-	/* used for the setclientid verifier */
-	struct timespec		cl_boot_time;
-
 	/* idmapper */
 	struct idmap *		cl_idmap;
 
-- 
cgit v0.10.2


From f411703adc762a92b72f8a93c6464050d66cb87b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:45:50 -0400
Subject: NFS: Refactor nfs_get_client(): add nfs_found_client()

Clean up: Code that takes and releases nfs_client_lock remains in
nfs_get_client().  Logic that handles a pre-existing nfs_client is
moved to a separate function.

No behavior change is expected.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index af9b7e4..5f19f95 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -506,6 +506,35 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
 }
 
 /*
+ * Found an existing client.  Make sure it's ready before returning.
+ */
+static struct nfs_client *
+nfs_found_client(const struct nfs_client_initdata *cl_init,
+		 struct nfs_client *clp)
+{
+	int error;
+
+	error = wait_event_killable(nfs_client_active_wq,
+				clp->cl_cons_state < NFS_CS_INITING);
+	if (error < 0) {
+		nfs_put_client(clp);
+		return ERR_PTR(-ERESTARTSYS);
+	}
+
+	if (clp->cl_cons_state < NFS_CS_READY) {
+		error = clp->cl_cons_state;
+		nfs_put_client(clp);
+		return ERR_PTR(error);
+	}
+
+	BUG_ON(clp->cl_cons_state != NFS_CS_READY);
+
+	dprintk("<-- %s found nfs_client %p for %s\n",
+		__func__, clp, cl_init->hostname ?: "");
+	return clp;
+}
+
+/*
  * Look up a client by IP address and protocol version
  * - creates a new record if one doesn't yet exist
  */
@@ -528,8 +557,12 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 		spin_lock(&nn->nfs_client_lock);
 
 		clp = nfs_match_client(cl_init);
-		if (clp)
-			goto found_client;
+		if (clp) {
+			spin_unlock(&nn->nfs_client_lock);
+			if (new)
+				nfs_free_client(new);
+			return nfs_found_client(cl_init, clp);
+		}
 		if (new)
 			goto install_client;
 
@@ -538,7 +571,8 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 		new = nfs_alloc_client(cl_init);
 	} while (!IS_ERR(new));
 
-	dprintk("--> nfs_get_client() = %ld [failed]\n", PTR_ERR(new));
+	dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
+		cl_init->hostname ?: "", PTR_ERR(new));
 	return new;
 
 	/* install a new client and return with it unready */
@@ -555,33 +589,6 @@ install_client:
 	}
 	dprintk("--> nfs_get_client() = %p [new]\n", clp);
 	return clp;
-
-	/* found an existing client
-	 * - make sure it's ready before returning
-	 */
-found_client:
-	spin_unlock(&nn->nfs_client_lock);
-
-	if (new)
-		nfs_free_client(new);
-
-	error = wait_event_killable(nfs_client_active_wq,
-				clp->cl_cons_state < NFS_CS_INITING);
-	if (error < 0) {
-		nfs_put_client(clp);
-		return ERR_PTR(-ERESTARTSYS);
-	}
-
-	if (clp->cl_cons_state < NFS_CS_READY) {
-		error = clp->cl_cons_state;
-		nfs_put_client(clp);
-		return ERR_PTR(error);
-	}
-
-	BUG_ON(clp->cl_cons_state != NFS_CS_READY);
-
-	dprintk("--> nfs_get_client() = %p [share]\n", clp);
-	return clp;
 }
 
 /*
-- 
cgit v0.10.2


From 8cab4c390b43fe34c07bd33799c1bc24be648122 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:45:59 -0400
Subject: NFS: Refactor nfs_get_client(): initialize nfs_client

Clean up: Continue to rationalize the locking in nfs_get_client() by
moving the logic that handles the case where a matching server IP
address is not found.

When we support server trunking detection, client initialization may
return a different nfs_client struct than was passed to it.  Change
the synopsis of the init_client methods to return an nfs_client.

The client initialization logic in nfs_get_client() is not much more
than a wrapper around ->init_client.  It's simpler to keep the little
bits of error handling in the version-specific init_client methods.

No behavior change is expected.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5f19f95..8a4b3c2 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -546,7 +546,6 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 	       int noresvport)
 {
 	struct nfs_client *clp, *new = NULL;
-	int error;
 	struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
 
 	dprintk("--> nfs_get_client(%s,v%u)\n",
@@ -563,8 +562,13 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 				nfs_free_client(new);
 			return nfs_found_client(cl_init, clp);
 		}
-		if (new)
-			goto install_client;
+		if (new) {
+			list_add(&new->cl_share_link, &nn->nfs_client_list);
+			spin_unlock(&nn->nfs_client_lock);
+			return cl_init->rpc_ops->init_client(new,
+						timeparms, ip_addr,
+						authflavour, noresvport);
+		}
 
 		spin_unlock(&nn->nfs_client_lock);
 
@@ -574,21 +578,6 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 	dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
 		cl_init->hostname ?: "", PTR_ERR(new));
 	return new;
-
-	/* install a new client and return with it unready */
-install_client:
-	clp = new;
-	list_add(&clp->cl_share_link, &nn->nfs_client_list);
-	spin_unlock(&nn->nfs_client_lock);
-
-	error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
-					      authflavour, noresvport);
-	if (error < 0) {
-		nfs_put_client(clp);
-		return ERR_PTR(error);
-	}
-	dprintk("--> nfs_get_client() = %p [new]\n", clp);
-	return clp;
 }
 
 /*
@@ -813,10 +802,19 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
 	return 0;
 }
 
-/*
- * Initialise an NFS2 or NFS3 client
+/**
+ * nfs_init_client - Initialise an NFS2 or NFS3 client
+ *
+ * @clp: nfs_client to initialise
+ * @timeparms: timeout parameters for underlying RPC transport
+ * @ip_addr: IP presentation address (not used)
+ * @authflavor: authentication flavor for underlying RPC transport
+ * @noresvport: set if RPC transport can use an ephemeral source port
+ *
+ * Returns pointer to an NFS client, or an ERR_PTR value.
  */
-int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
+struct nfs_client *nfs_init_client(struct nfs_client *clp,
+		    const struct rpc_timeout *timeparms,
 		    const char *ip_addr, rpc_authflavor_t authflavour,
 		    int noresvport)
 {
@@ -825,7 +823,7 @@ int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
 	if (clp->cl_cons_state == NFS_CS_READY) {
 		/* the client is already initialised */
 		dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp);
-		return 0;
+		return clp;
 	}
 
 	/*
@@ -837,12 +835,13 @@ int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
 	if (error < 0)
 		goto error;
 	nfs_mark_client_ready(clp, NFS_CS_READY);
-	return 0;
+	return clp;
 
 error:
 	nfs_mark_client_ready(clp, error);
+	nfs_put_client(clp);
 	dprintk("<-- nfs_init_client() = xerror %d\n", error);
-	return error;
+	return ERR_PTR(error);
 }
 
 /*
@@ -1358,14 +1357,22 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
 	return nfs4_init_callback(clp);
 }
 
-/*
- * Initialise an NFS4 client record
+/**
+ * nfs4_init_client - Initialise an NFS4 client record
+ *
+ * @clp: nfs_client to initialise
+ * @timeparms: timeout parameters for underlying RPC transport
+ * @ip_addr: callback IP address in presentation format
+ * @authflavor: authentication flavor for underlying RPC transport
+ * @noresvport: set if RPC transport can use an ephemeral source port
+ *
+ * Returns pointer to an NFS client, or an ERR_PTR value.
  */
-int nfs4_init_client(struct nfs_client *clp,
-		     const struct rpc_timeout *timeparms,
-		     const char *ip_addr,
-		     rpc_authflavor_t authflavour,
-		     int noresvport)
+struct nfs_client *nfs4_init_client(struct nfs_client *clp,
+				    const struct rpc_timeout *timeparms,
+				    const char *ip_addr,
+				    rpc_authflavor_t authflavour,
+				    int noresvport)
 {
 	char buf[INET6_ADDRSTRLEN + 1];
 	int error;
@@ -1373,7 +1380,7 @@ int nfs4_init_client(struct nfs_client *clp,
 	if (clp->cl_cons_state == NFS_CS_READY) {
 		/* the client is initialised already */
 		dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp);
-		return 0;
+		return clp;
 	}
 
 	/* Check NFS protocol revision and initialize RPC op vector */
@@ -1413,12 +1420,13 @@ int nfs4_init_client(struct nfs_client *clp,
 
 	if (!nfs4_has_session(clp))
 		nfs_mark_client_ready(clp, NFS_CS_READY);
-	return 0;
+	return clp;
 
 error:
 	nfs_mark_client_ready(clp, error);
+	nfs_put_client(clp);
 	dprintk("<-- nfs4_init_client() = xerror %d\n", error);
-	return error;
+	return ERR_PTR(error);
 }
 
 /*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 989959a..3a9e80c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -238,7 +238,7 @@ extern int nfs4_init_ds_session(struct nfs_client *clp);
 
 /* proc.c */
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
-extern int nfs_init_client(struct nfs_client *clp,
+extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
 			   const struct rpc_timeout *timeparms,
 			   const char *ip_addr, rpc_authflavor_t authflavour,
 			   int noresvport);
@@ -373,7 +373,7 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 
 /* nfs4proc.c */
 extern void __nfs4_read_done_cb(struct nfs_read_data *);
-extern int nfs4_init_client(struct nfs_client *clp,
+extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 			    const struct rpc_timeout *timeparms,
 			    const char *ip_addr,
 			    rpc_authflavor_t authflavour,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index c420b8d..0c521cd 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1397,7 +1397,8 @@ struct nfs_rpc_ops {
 				struct nfs_open_context *ctx,
 				int open_flags,
 				struct iattr *iattr);
-	int	(*init_client) (struct nfs_client *, const struct rpc_timeout *,
+	struct nfs_client *
+		(*init_client) (struct nfs_client *, const struct rpc_timeout *,
 				const char *, rpc_authflavor_t, int);
 };
 
-- 
cgit v0.10.2


From 4bf590e08f6db3395c181618a4c14f1c39b7c4af Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:46:07 -0400
Subject: NFS: Add nfs_client behavior flags

"noresvport" and "discrtry" can be passed to nfs_create_rpc_client()
by setting flags in the passed-in nfs_client.  This change makes it
easy to add new flags.

Note that these settings are now "sticky" over the lifetime of a
struct nfs_client, and may even be copied when an nfs_client is
cloned.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8a4b3c2..34b2e68 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -131,6 +131,7 @@ const struct rpc_program nfsacl_program = {
 #endif  /* CONFIG_NFS_V3_ACL */
 
 struct nfs_client_initdata {
+	unsigned long init_flags;
 	const char *hostname;
 	const struct sockaddr *addr;
 	size_t addrlen;
@@ -542,8 +543,7 @@ static struct nfs_client *
 nfs_get_client(const struct nfs_client_initdata *cl_init,
 	       const struct rpc_timeout *timeparms,
 	       const char *ip_addr,
-	       rpc_authflavor_t authflavour,
-	       int noresvport)
+	       rpc_authflavor_t authflavour)
 {
 	struct nfs_client *clp, *new = NULL;
 	struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
@@ -565,9 +565,10 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 		if (new) {
 			list_add(&new->cl_share_link, &nn->nfs_client_list);
 			spin_unlock(&nn->nfs_client_lock);
+			new->cl_flags = cl_init->init_flags;
 			return cl_init->rpc_ops->init_client(new,
 						timeparms, ip_addr,
-						authflavour, noresvport);
+						authflavour);
 		}
 
 		spin_unlock(&nn->nfs_client_lock);
@@ -651,8 +652,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
  */
 static int nfs_create_rpc_client(struct nfs_client *clp,
 				 const struct rpc_timeout *timeparms,
-				 rpc_authflavor_t flavor,
-				 int discrtry, int noresvport)
+				 rpc_authflavor_t flavor)
 {
 	struct rpc_clnt		*clnt = NULL;
 	struct rpc_create_args args = {
@@ -667,9 +667,9 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
 		.authflavor	= flavor,
 	};
 
-	if (discrtry)
+	if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))
 		args.flags |= RPC_CLNT_CREATE_DISCRTRY;
-	if (noresvport)
+	if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))
 		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 
 	if (!IS_ERR(clp->cl_rpcclient))
@@ -809,14 +809,12 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
  * @timeparms: timeout parameters for underlying RPC transport
  * @ip_addr: IP presentation address (not used)
  * @authflavor: authentication flavor for underlying RPC transport
- * @noresvport: set if RPC transport can use an ephemeral source port
  *
  * Returns pointer to an NFS client, or an ERR_PTR value.
  */
 struct nfs_client *nfs_init_client(struct nfs_client *clp,
 		    const struct rpc_timeout *timeparms,
-		    const char *ip_addr, rpc_authflavor_t authflavour,
-		    int noresvport)
+		    const char *ip_addr, rpc_authflavor_t authflavour)
 {
 	int error;
 
@@ -830,8 +828,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp,
 	 * Create a client RPC handle for doing FSSTAT with UNIX auth only
 	 * - RFC 2623, sec 2.3.2
 	 */
-	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
-				      0, noresvport);
+	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
 	if (error < 0)
 		goto error;
 	nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -881,10 +878,11 @@ static int nfs_init_server(struct nfs_server *server,
 
 	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
 			data->timeo, data->retrans);
+	if (data->flags & NFS_MOUNT_NORESVPORT)
+		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
-			     data->flags & NFS_MOUNT_NORESVPORT);
+	clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
 	if (IS_ERR(clp)) {
 		dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
 		return PTR_ERR(clp);
@@ -1364,15 +1362,13 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
  * @timeparms: timeout parameters for underlying RPC transport
  * @ip_addr: callback IP address in presentation format
  * @authflavor: authentication flavor for underlying RPC transport
- * @noresvport: set if RPC transport can use an ephemeral source port
  *
  * Returns pointer to an NFS client, or an ERR_PTR value.
  */
 struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 				    const struct rpc_timeout *timeparms,
 				    const char *ip_addr,
-				    rpc_authflavor_t authflavour,
-				    int noresvport)
+				    rpc_authflavor_t authflavour)
 {
 	char buf[INET6_ADDRSTRLEN + 1];
 	int error;
@@ -1386,8 +1382,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 	/* Check NFS protocol revision and initialize RPC op vector */
 	clp->rpc_ops = &nfs_v4_clientops;
 
-	error = nfs_create_rpc_client(clp, timeparms, authflavour,
-				      1, noresvport);
+	__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
+	error = nfs_create_rpc_client(clp, timeparms, authflavour);
 	if (error < 0)
 		goto error;
 
@@ -1455,9 +1451,11 @@ static int nfs4_set_client(struct nfs_server *server,
 
 	dprintk("--> nfs4_set_client()\n");
 
+	if (server->flags & NFS_MOUNT_NORESVPORT)
+		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
-			     server->flags & NFS_MOUNT_NORESVPORT);
+	clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
 	if (IS_ERR(clp)) {
 		error = PTR_ERR(clp);
 		goto error;
@@ -1512,7 +1510,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 	 */
 	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
 	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
-			     mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
+			     mds_clp->cl_rpcclient->cl_auth->au_flavor);
 
 	dprintk("<-- %s %p\n", __func__, clp);
 	return clp;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 3a9e80c..547f24f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -240,8 +240,7 @@ extern int nfs4_init_ds_session(struct nfs_client *clp);
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
 extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
 			   const struct rpc_timeout *timeparms,
-			   const char *ip_addr, rpc_authflavor_t authflavour,
-			   int noresvport);
+			   const char *ip_addr, rpc_authflavor_t authflavour);
 
 /* dir.c */
 extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -376,8 +375,7 @@ extern void __nfs4_read_done_cb(struct nfs_read_data *);
 extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 			    const struct rpc_timeout *timeparms,
 			    const char *ip_addr,
-			    rpc_authflavor_t authflavour,
-			    int noresvport);
+			    rpc_authflavor_t authflavour);
 extern int _nfs4_call_sync(struct rpc_clnt *clnt,
 			   struct nfs_server *server,
 			   struct rpc_message *msg,
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index fbec57d..3a99f52 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -35,6 +35,9 @@ struct nfs_client {
 #define NFS_CS_RENEWD		3		/* - renewd started */
 #define NFS_CS_STOP_RENEW	4		/* no more state to renew */
 #define NFS_CS_CHECK_LEASE_TIME	5		/* need to check lease time */
+	unsigned long		cl_flags;	/* behavior switches */
+#define NFS_CS_NORESVPORT	0		/* - use ephemeral src port */
+#define NFS_CS_DISCRTRY		1		/* - disconnect on RPC retry */
 	struct sockaddr_storage	cl_addr;	/* server identifier */
 	size_t			cl_addrlen;
 	char *			cl_hostname;	/* hostname of server */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0c521cd..07048c0 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1399,7 +1399,7 @@ struct nfs_rpc_ops {
 				struct iattr *iattr);
 	struct nfs_client *
 		(*init_client) (struct nfs_client *, const struct rpc_timeout *,
-				const char *, rpc_authflavor_t, int);
+				const char *, rpc_authflavor_t);
 };
 
 /*
-- 
cgit v0.10.2


From acdeb69d9c5934a678a732b4e24770326bf9471e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 21 May 2012 22:46:16 -0400
Subject: NFS: EXCHANGE_ID should save the server major and minor ID

Save the server major and minor ID results from EXCHANGE_ID, as they
are needed for detecting server trunking.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 34b2e68..3c14468 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -237,6 +237,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
 		nfs_idmap_delete(clp);
 
 	rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
+	kfree(clp->cl_serverowner);
 	kfree(clp->cl_serverscope);
 	kfree(clp->cl_implid);
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9e9334a..0d46fe4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5109,11 +5109,18 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 				clp->cl_rpcclient->cl_nodename,
 				clp->cl_rpcclient->cl_auth->au_flavor);
 
+	res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
+					GFP_KERNEL);
+	if (unlikely(res.server_owner == NULL)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
 	res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
 					GFP_KERNEL);
 	if (unlikely(res.server_scope == NULL)) {
 		status = -ENOMEM;
-		goto out;
+		goto out_server_owner;
 	}
 
 	res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_KERNEL);
@@ -5127,6 +5134,12 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 		status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
 
 	if (status == 0) {
+		kfree(clp->cl_serverowner);
+		clp->cl_serverowner = res.server_owner;
+		res.server_owner = NULL;
+	}
+
+	if (status == 0) {
 		/* use the most recent implementation id */
 		kfree(clp->cl_implid);
 		clp->cl_implid = res.impl_id;
@@ -5150,6 +5163,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 		}
 	}
 
+out_server_owner:
+	kfree(res.server_owner);
 out_server_scope:
 	kfree(res.server_scope);
 out:
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 12b9982..5ad2b2c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5144,24 +5144,27 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	if (dummy != SP4_NONE)
 		return -EIO;
 
-	/* Throw away minor_id */
+	/* server_owner4.so_minor_id */
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
+	p = xdr_decode_hyper(p, &res->server_owner->minor_id);
 
-	/* Throw away Major id */
+	/* server_owner4.so_major_id */
 	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
 	if (unlikely(status))
 		return status;
+	if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
+		return -EIO;
+	memcpy(res->server_owner->major_id, dummy_str, dummy);
+	res->server_owner->major_id_sz = dummy;
 
-	/* Save server_scope */
+	/* server_scope4 */
 	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
 	if (unlikely(status))
 		return status;
-
 	if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
 		return -EIO;
-
 	memcpy(res->server_scope->server_scope, dummy_str, dummy);
 	res->server_scope->server_scope_sz = dummy;
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 3a99f52..fbb78fb 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -80,6 +80,7 @@ struct nfs_client {
 	/* The flags used for obtaining the clientid during EXCHANGE_ID */
 	u32			cl_exchange_flags;
 	struct nfs4_session	*cl_session;	/* shared session */
+	struct nfs41_server_owner *cl_serverowner;
 	struct nfs41_server_scope *cl_serverscope;
 	struct nfs41_impl_id	*cl_implid;
 #endif /* CONFIG_NFS_V4 */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 07048c0..0872f32 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1098,7 +1098,7 @@ struct nfs41_exchange_id_args {
 	u32				flags;
 };
 
-struct server_owner {
+struct nfs41_server_owner {
 	uint64_t			minor_id;
 	uint32_t			major_id_sz;
 	char				major_id[NFS4_OPAQUE_LIMIT];
@@ -1118,6 +1118,7 @@ struct nfs41_impl_id {
 struct nfs41_exchange_id_res {
 	struct nfs_client		*client;
 	u32				flags;
+	struct nfs41_server_owner	*server_owner;
 	struct nfs41_server_scope	*server_scope;
 	struct nfs41_impl_id		*impl_id;
 };
-- 
cgit v0.10.2


From 77820ffae678fa7ff6cc155354825b6b1a023afb Mon Sep 17 00:00:00 2001
From: Seth Heasley <seth.heasley@intel.com>
Date: Tue, 22 May 2012 22:54:20 +0200
Subject: gpio: Add Intel Centerton support to gpio-sch

This patch adds the Intel Centerton processor device ID for GPIO.
The device ID is defined in include/linux/pci_ids.h

Signed-off-by: Seth Heasley <seth.heasley@intel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 09ac540..6da36a5 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -170,13 +170,13 @@ config GPIO_VR41XX
 	  Say yes here to support the NEC VR4100 series General-purpose I/O Uint
 
 config GPIO_SCH
-	tristate "Intel SCH/TunnelCreek GPIO"
+	tristate "Intel SCH/TunnelCreek/Centerton GPIO"
 	depends on PCI && X86
 	select MFD_CORE
 	select LPC_SCH
 	help
-	  Say yes here to support GPIO interface on Intel Poulsbo SCH
-	  or Intel Tunnel Creek processor.
+	  Say yes here to support GPIO interface on Intel Poulsbo SCH,
+	  Intel Tunnel Creek processor or Intel Centerton processor.
 	  The Intel SCH contains a total of 14 GPIO pins. Ten GPIOs are
 	  powered by the core power rail and are turned off during sleep
 	  modes (S3 and higher). The remaining four GPIOs are powered by
@@ -185,6 +185,9 @@ config GPIO_SCH
 	  system from the Suspend-to-RAM state.
 	  The Intel Tunnel Creek processor has 5 GPIOs powered by the
 	  core power rail and 9 from suspend power supply.
+	  The Intel Centerton processor has a total of 30 GPIO pins.
+	  Twenty-one are powered by the core power rail and 9 from the
+	  suspend power supply.
 
 config GPIO_ICH
 	tristate "Intel ICH GPIO"
diff --git a/drivers/gpio/gpio-sch.c b/drivers/gpio/gpio-sch.c
index 8cadf4d..424dce8 100644
--- a/drivers/gpio/gpio-sch.c
+++ b/drivers/gpio/gpio-sch.c
@@ -232,6 +232,14 @@ static int __devinit sch_gpio_probe(struct platform_device *pdev)
 			sch_gpio_resume.ngpio = 9;
 			break;
 
+		case PCI_DEVICE_ID_INTEL_CENTERTON_ILB:
+			sch_gpio_core.base = 0;
+			sch_gpio_core.ngpio = 21;
+
+			sch_gpio_resume.base = 21;
+			sch_gpio_resume.ngpio = 9;
+			break;
+
 		default:
 			return -ENODEV;
 	}
-- 
cgit v0.10.2


From 730a3d01b1e1e3ba102a5a4d3d5dcfecd55326b6 Mon Sep 17 00:00:00 2001
From: Johan Hovold <jhovold@gmail.com>
Date: Fri, 18 May 2012 20:22:45 +0200
Subject: mfd: Add r_select to lm3533 platform data

Add resistor-select parameter to the platform data.

Signed-off-by: Johan Hovold <jhovold@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/include/linux/mfd/lm3533.h b/include/linux/mfd/lm3533.h
index 9660feb..594bc59 100644
--- a/include/linux/mfd/lm3533.h
+++ b/include/linux/mfd/lm3533.h
@@ -43,6 +43,7 @@ struct lm3533_ctrlbank {
 
 struct lm3533_als_platform_data {
 	unsigned pwm_mode:1;		/* PWM input mode (default analog) */
+	u8 r_select;			/* 1 - 127 (ignored in PWM-mode) */
 };
 
 struct lm3533_bl_platform_data {
-- 
cgit v0.10.2


From cb8d8654570c257d2ec5f7fa089e18b338314317 Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Sat, 19 May 2012 02:01:41 +0530
Subject: mfd: Save device node parsed platform data for tps65910 sub devices

Save the allocated memory to store the parsed device node information
to the global device structure so that sub devices can directly use this
pointer.
In this way, the sub devices does not require to re-allocate the
memory for storing the sub-devices specific device node information.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c
index 18b30cf..05d449b 100644
--- a/drivers/mfd/tps65910.c
+++ b/drivers/mfd/tps65910.c
@@ -209,14 +209,17 @@ static __devinit int tps65910_i2c_probe(struct i2c_client *i2c,
 {
 	struct tps65910 *tps65910;
 	struct tps65910_board *pmic_plat_data;
+	struct tps65910_board *of_pmic_plat_data = NULL;
 	struct tps65910_platform_data *init_data;
 	int ret = 0;
 	int chip_id = id->driver_data;
 
 	pmic_plat_data = dev_get_platdata(&i2c->dev);
 
-	if (!pmic_plat_data && i2c->dev.of_node)
+	if (!pmic_plat_data && i2c->dev.of_node) {
 		pmic_plat_data = tps65910_parse_dt(i2c, &chip_id);
+		of_pmic_plat_data = pmic_plat_data;
+	}
 
 	if (!pmic_plat_data)
 		return -EINVAL;
@@ -229,6 +232,7 @@ static __devinit int tps65910_i2c_probe(struct i2c_client *i2c,
 	if (tps65910 == NULL)
 		return -ENOMEM;
 
+	tps65910->of_plat_data = of_pmic_plat_data;
 	i2c_set_clientdata(i2c, tps65910);
 	tps65910->dev = &i2c->dev;
 	tps65910->i2c_client = i2c;
diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h
index ab04e90..dd8dc0a 100644
--- a/include/linux/mfd/tps65910.h
+++ b/include/linux/mfd/tps65910.h
@@ -830,6 +830,9 @@ struct tps65910 {
 	struct tps65910_rtc *rtc;
 	struct tps65910_power *power;
 
+	/* Device node parsed board data */
+	struct tps65910_board *of_plat_data;
+
 	/* IRQ Handling */
 	struct mutex irq_lock;
 	int chip_irq;
-- 
cgit v0.10.2


From dcc7dabd8eb13e968ee3ec52a1bb9829a3bc904e Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Sat, 19 May 2012 02:01:42 +0530
Subject: mfd: Remove the parsing of dt info for tps65910 gpio

Remove the parsing of device node information for sub devices
from core file.
The sub devices will parse the information as per the sub-devices
specific information.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c
index 05d449b..be9e07b 100644
--- a/drivers/mfd/tps65910.c
+++ b/drivers/mfd/tps65910.c
@@ -146,9 +146,7 @@ static struct tps65910_board *tps65910_parse_dt(struct i2c_client *client,
 	struct tps65910_board *board_info;
 	unsigned int prop;
 	const struct of_device_id *match;
-	unsigned int prop_array[TPS6591X_MAX_NUM_GPIO];
 	int ret = 0;
-	int idx;
 
 	match = of_match_device(tps65910_of_match, &client->dev);
 	if (!match) {
@@ -177,21 +175,8 @@ static struct tps65910_board *tps65910_parse_dt(struct i2c_client *client,
 	else if (*chip_id == TPS65911)
 		dev_warn(&client->dev, "VMBCH2-Threshold not specified");
 
-	ret = of_property_read_u32_array(np, "ti,en-gpio-sleep",
-				   prop_array, TPS6591X_MAX_NUM_GPIO);
-	if (!ret)
-		for (idx = 0; idx < ARRAY_SIZE(prop_array); idx++)
-			board_info->en_gpio_sleep[idx] = (prop_array[idx] != 0);
-	else if (ret != -EINVAL) {
-		dev_err(&client->dev,
-			"error reading property ti,en-gpio-sleep: %d\n.", ret);
-		return NULL;
-	}
-
-
 	board_info->irq = client->irq;
 	board_info->irq_base = -1;
-	board_info->gpio_base = -1;
 
 	return board_info;
 }
-- 
cgit v0.10.2


From 6fe02e9f46fda7c33e48e4f9812663516cd25a4b Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Sat, 19 May 2012 02:01:43 +0530
Subject: gpio: tps65910: dt: process gpio specific device node info

Parse the gpio specific device node information locally.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/gpio/gpio-tps65910.c b/drivers/gpio/gpio-tps65910.c
index af6dc83..c1ad288 100644
--- a/drivers/gpio/gpio-tps65910.c
+++ b/drivers/gpio/gpio-tps65910.c
@@ -20,6 +20,7 @@
 #include <linux/i2c.h>
 #include <linux/platform_device.h>
 #include <linux/mfd/tps65910.h>
+#include <linux/of_device.h>
 
 struct tps65910_gpio {
 	struct gpio_chip gpio_chip;
@@ -81,6 +82,37 @@ static int tps65910_gpio_input(struct gpio_chip *gc, unsigned offset)
 						GPIO_CFG_MASK);
 }
 
+#ifdef CONFIG_OF
+static struct tps65910_board *tps65910_parse_dt_for_gpio(struct device *dev,
+		struct tps65910 *tps65910, int chip_ngpio)
+{
+	struct tps65910_board *tps65910_board = tps65910->of_plat_data;
+	unsigned int prop_array[TPS6591X_MAX_NUM_GPIO];
+	int ngpio = min(chip_ngpio, TPS6591X_MAX_NUM_GPIO);
+	int ret;
+	int idx;
+
+	tps65910_board->gpio_base = -1;
+	ret = of_property_read_u32_array(tps65910->dev->of_node,
+			"ti,en-gpio-sleep", prop_array, ngpio);
+	if (ret < 0) {
+		dev_dbg(dev, "ti,en-gpio-sleep not specified\n");
+		return tps65910_board;
+	}
+
+	for (idx = 0; idx < ngpio; idx++)
+		tps65910_board->en_gpio_sleep[idx] = (prop_array[idx] != 0);
+
+	return tps65910_board;
+}
+#else
+static struct tps65910_board *tps65910_parse_dt_for_gpio(struct device *dev,
+		struct tps65910 *tps65910, int chip_ngpio)
+{
+	return NULL;
+}
+#endif
+
 static int __devinit tps65910_gpio_probe(struct platform_device *pdev)
 {
 	struct tps65910 *tps65910 = dev_get_drvdata(pdev->dev.parent);
@@ -122,6 +154,10 @@ static int __devinit tps65910_gpio_probe(struct platform_device *pdev)
 	else
 		tps65910_gpio->gpio_chip.base = -1;
 
+	if (!pdata && tps65910->dev->of_node)
+		pdata = tps65910_parse_dt_for_gpio(&pdev->dev, tps65910,
+			tps65910_gpio->gpio_chip.ngpio);
+
 	if (!pdata)
 		goto skip_init;
 
-- 
cgit v0.10.2


From 21f082a66177852365df0c955ecaef50fba9a691 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Wed, 23 May 2012 10:22:10 +0100
Subject: mfd: ab8500-core should depend on MFD_DB8500_PRCMU

A recent move to eliminate excess historical baggage from ab8500 core
code resulting in errors when building with x86_64 allmodconfig:

In file included from drivers/mfd/ab8500-core.c:21:0:
include/linux/mfd/dbx500-prcmu.h:614:19: error: redefinition of 'prcmu_abb_read'
include/linux/mfd/db8500-prcmu.h:673:19: note: previous definition of 'prcmu_abb_read' was here
include/linux/mfd/dbx500-prcmu.h:619:19: error: redefinition of 'prcmu_abb_write'
include/linux/mfd/db8500-prcmu.h:678:19: note: previous definition of 'prcmu_abb_write' was here
include/linux/mfd/dbx500-prcmu.h:630:19: error: redefinition of 'prcmu_config_clkout'
include/linux/mfd/db8500-prcmu.h:643:19: note: previous definition of 'prcmu_config_clkout' was here
include/linux/mfd/dbx500-prcmu.h:692:20: error: redefinition of 'prcmu_ac_wake_req'
include/linux/mfd/db8500-prcmu.h:683:20: note: previous definition of 'prcmu_ac_wake_req' was here
include/linux/mfd/dbx500-prcmu.h:694:20: error: redefinition of 'prcmu_ac_sleep_req'
include/linux/mfd/db8500-prcmu.h:685:20: note: previous definition of 'prcmu_ac_sleep_req' was here

Problem:
When CONFIG_AB8500_CORE is set, building ab8500-core.c and
!(CONFIG_UX500_SOC_DB8500 | CONFIG_MFD_DB8500_PRCMU), both db8500-prcmu.h
and dbx500-prcmu.h take it upon themselves to _both_ create 'return 0'
inline functions for the following:

prcmu_abb_read()
prcmu_abb_write()
prcmu_config_clkout()
prcmu_ac_wake_req()
prcmu_ac_sleep_req()

Solution:
Depend on MFD_DB8500_PRCMU, which in turn depends on UX500_SOC_DB8500.

Reported-By: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 094bf4e..0c3f5de 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -709,7 +709,7 @@ config AB5500_DEBUG
 
 config AB8500_CORE
 	bool "ST-Ericsson AB8500 Mixed Signal Power Management chip"
-	depends on GENERIC_HARDIRQS && ABX500_CORE
+	depends on GENERIC_HARDIRQS && ABX500_CORE && MFD_DB8500_PRCMU
 	select MFD_CORE
 	help
 	  Select this option to enable access to AB8500 power management
-- 
cgit v0.10.2


From 78302a194c0ddf4438e50e3f9b327a6dce6bc8fc Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Wed, 23 May 2012 13:28:33 +0200
Subject: mfd: Fix max77693 build failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without it we get:

drivers/mfd/max77693.c: In function ‘max77693_i2c_probe’:
drivers/mfd/max77693.c:157:2: error: implicit declaration of function
‘max77693_irq_init’ [-Werror=implicit-function-declaration]
drivers/mfd/max77693.c: In function ‘max77693_resume’:
drivers/mfd/max77693.c:215:2: error: implicit declaration of function
‘max77693_irq_resume’ [-Werror=implicit-function-declaration]
drivers/mfd/max77693-irq.c: In function ‘max77693_irq_lock’:
drivers/mfd/max77693-irq.c:104:2: error: ‘struct max77693_dev’ has no member
named ‘irqlock’
drivers/mfd/max77693-irq.c: In function ‘max77693_irq_sync_unlock’:
drivers/mfd/max77693-irq.c:119:11: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cache’
drivers/mfd/max77693-irq.c:119:42: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cur’
drivers/mfd/max77693-irq.c:122:13: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cur’
drivers/mfd/max77693-irq.c:125:24: error: ‘struct max77693_dev’ has no member
named ‘irqlock’
drivers/mfd/max77693-irq.c: In function ‘max77693_irq_mask’:
drivers/mfd/max77693-irq.c:141:11: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cur’
drivers/mfd/max77693-irq.c:143:11: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cur’
drivers/mfd/max77693-irq.c: In function ‘max77693_irq_unmask’:
drivers/mfd/max77693-irq.c:153:11: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cur’
drivers/mfd/max77693-irq.c:155:11: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cur’
drivers/mfd/max77693-irq.c: In function ‘max77693_irq_thread’:
drivers/mfd/max77693-irq.c:209:26: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cur’
drivers/mfd/max77693-irq.c:211:27: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cur’
drivers/mfd/max77693-irq.c:217:39: error: ‘struct max77693_dev’ has no member
named ‘irq_domain’
drivers/mfd/max77693-irq.c: In function ‘max77693_irq_init’:
drivers/mfd/max77693-irq.c:260:2: error: ‘struct max77693_dev’ has no member
named ‘irqlock’
drivers/mfd/max77693-irq.c:268:12: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cur’
drivers/mfd/max77693-irq.c:269:12: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cache’
drivers/mfd/max77693-irq.c:271:12: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cur’
drivers/mfd/max77693-irq.c:272:12: error: ‘struct max77693_dev’ has no member
named ‘irq_masks_cache’
drivers/mfd/max77693-irq.c:292:10: error: ‘struct max77693_dev’ has no member
named ‘irq_domain’

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h
index bf6077d..68263c5 100644
--- a/include/linux/mfd/max77693-private.h
+++ b/include/linux/mfd/max77693-private.h
@@ -198,8 +198,14 @@ struct max77693_dev {
 	struct regmap *regmap_muic;
 	struct regmap *regmap_haptic;
 
+	struct irq_domain *irq_domain;
+
 	int irq;
+	int irq_gpio;
 	bool wakeup;
+	struct mutex irqlock;
+	int irq_masks_cur[MAX77693_IRQ_GROUP_NR];
+	int irq_masks_cache[MAX77693_IRQ_GROUP_NR];
 };
 
 enum max77693_types {
@@ -214,4 +220,8 @@ extern int max77693_bulk_write(struct regmap *map, u8 reg, int count,
 				u8 *buf);
 extern int max77693_update_reg(struct regmap *map, u8 reg, u8 val, u8 mask);
 
+extern int max77693_irq_init(struct max77693_dev *max77686);
+extern void max77693_irq_exit(struct max77693_dev *max77686);
+extern int max77693_irq_resume(struct max77693_dev *max77686);
+
 #endif /*  __LINUX_MFD_MAX77693_PRIV_H */
-- 
cgit v0.10.2


From ca2cad6ae38ea0ff27a7a7a00bfaa571fbe9051f Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Wed, 23 May 2012 16:23:21 +0200
Subject: mfd: Fix twl6040 build failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without it we get:

 CC      drivers/mfd/twl6040-core.o
drivers/mfd/twl6040-core.c: In function ‘twl6040_has_vibra’:
drivers/mfd/twl6040-core.c:55:2: error: implicit declaration of function
‘of_find_node_by_name’ [-Werror=implicit-function-declaration]

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/twl6040-core.c b/drivers/mfd/twl6040-core.c
index 450a28f..4ded9e7 100644
--- a/drivers/mfd/twl6040-core.c
+++ b/drivers/mfd/twl6040-core.c
@@ -45,6 +45,20 @@
 #define VIBRACTRL_MEMBER(reg) ((reg == TWL6040_REG_VIBCTLL) ? 0 : 1)
 #define TWL6040_NUM_SUPPLIES	(2)
 
+static bool twl6040_has_vibra(struct twl6040_platform_data *pdata,
+			      struct device_node *node)
+{
+	if (pdata && pdata->vibra)
+		return true;
+
+#ifdef CONFIG_OF
+	if (of_find_node_by_name(node, "vibra"))
+		return true;
+#endif
+
+	return false;
+}
+
 int twl6040_reg_read(struct twl6040 *twl6040, unsigned int reg)
 {
 	int ret;
@@ -617,7 +631,7 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 	}
 	children++;
 
-	if ((pdata && pdata->vibra) || of_find_node_by_name(node, "vibra")) {
+	if (twl6040_has_vibra(pdata, node)) {
 		irq = twl6040->irq_base + TWL6040_IRQ_VIB;
 
 		cell = &twl6040->cells[children];
-- 
cgit v0.10.2


From 7b38c3682c5cab4f98751d5fe57b78a59020653d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 23 May 2012 13:23:31 -0400
Subject: NFSv4.1: Fix session initialisation races

Session initialisation is not complete until the lease manager
has run. We need to ensure that both nfs4_init_session and
nfs4_init_ds_session do so, and that they check for any resulting
errors in clp->cl_cons_state.

Only after this is done, can nfs4_ds_connect check the contents
of clp->cl_exchange_flags.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Andy Adamson <andros@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 3c14468..25dde07 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -592,22 +592,6 @@ void nfs_mark_client_ready(struct nfs_client *clp, int state)
 }
 
 /*
- * With sessions, the client is not marked ready until after a
- * successful EXCHANGE_ID and CREATE_SESSION.
- *
- * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate
- * other versions of NFS can be tried.
- */
-int nfs4_check_client_ready(struct nfs_client *clp)
-{
-	if (!nfs4_has_session(clp))
-		return 0;
-	if (clp->cl_cons_state < NFS_CS_READY)
-		return -EPROTONOSUPPORT;
-	return 0;
-}
-
-/*
  * Initialise the timeout values for a connection
  */
 static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 547f24f..5ea571e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -169,7 +169,6 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
 					   struct nfs_fattr *,
 					   rpc_authflavor_t);
 extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
-extern int nfs4_check_client_ready(struct nfs_client *clp);
 extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 					     const struct sockaddr *ds_addr,
 					     int ds_addrlen, int ds_proto,
@@ -234,7 +233,7 @@ extern const u32 nfs41_maxwrite_overhead;
 extern struct rpc_procinfo nfs4_procedures[];
 #endif
 
-extern int nfs4_init_ds_session(struct nfs_client *clp);
+extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
 
 /* proc.c */
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index c610f84..a1fab8d 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -203,28 +203,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
 		goto out;
 	}
 
-	if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
-		if (!is_ds_client(clp)) {
-			status = -ENODEV;
-			goto out_put;
-		}
-		ds->ds_clp = clp;
-		dprintk("%s [existing] server=%s\n", __func__,
-			ds->ds_remotestr);
-		goto out;
-	}
-
-	/*
-	 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to
-	 * be equal to the MDS lease. Renewal is scheduled in create_session.
-	 */
-	spin_lock(&mds_srv->nfs_client->cl_lock);
-	clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
-	spin_unlock(&mds_srv->nfs_client->cl_lock);
-	clp->cl_last_renewal = jiffies;
-
-	/* New nfs_client */
-	status = nfs4_init_ds_session(clp);
+	status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
 	if (status)
 		goto out_put;
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0d46fe4..c856298 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5603,53 +5603,78 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
 	return status;
 }
 
+/*
+ * With sessions, the client is not marked ready until after a
+ * successful EXCHANGE_ID and CREATE_SESSION.
+ *
+ * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate
+ * other versions of NFS can be tried.
+ */
+static int nfs41_check_session_ready(struct nfs_client *clp)
+{
+	int ret;
+	
+	if (clp->cl_cons_state == NFS_CS_SESSION_INITING) {
+		ret = nfs4_client_recover_expired_lease(clp);
+		if (ret)
+			return ret;
+	}
+	if (clp->cl_cons_state < NFS_CS_READY)
+		return -EPROTONOSUPPORT;
+	return 0;
+}
+
 int nfs4_init_session(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_session *session;
 	unsigned int rsize, wsize;
-	int ret;
 
 	if (!nfs4_has_session(clp))
 		return 0;
 
 	session = clp->cl_session;
-	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
-		return 0;
+	spin_lock(&clp->cl_lock);
+	if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
 
-	rsize = server->rsize;
-	if (rsize == 0)
-		rsize = NFS_MAX_FILE_IO_SIZE;
-	wsize = server->wsize;
-	if (wsize == 0)
-		wsize = NFS_MAX_FILE_IO_SIZE;
+		rsize = server->rsize;
+		if (rsize == 0)
+			rsize = NFS_MAX_FILE_IO_SIZE;
+		wsize = server->wsize;
+		if (wsize == 0)
+			wsize = NFS_MAX_FILE_IO_SIZE;
 
-	session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
-	session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
+		session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
+		session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
+	}
+	spin_unlock(&clp->cl_lock);
 
-	ret = nfs4_recover_expired_lease(server);
-	if (!ret)
-		ret = nfs4_check_client_ready(clp);
-	return ret;
+	return nfs41_check_session_ready(clp);
 }
 
-int nfs4_init_ds_session(struct nfs_client *clp)
+int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
 {
 	struct nfs4_session *session = clp->cl_session;
 	int ret;
 
-	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
-		return 0;
-
-	ret = nfs4_client_recover_expired_lease(clp);
-	if (!ret)
-		/* Test for the DS role */
-		if (!is_ds_client(clp))
-			ret = -ENODEV;
-	if (!ret)
-		ret = nfs4_check_client_ready(clp);
-	return ret;
+	spin_lock(&clp->cl_lock);
+	if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
+		/*
+		 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the
+		 * DS lease to be equal to the MDS lease.
+		 */
+		clp->cl_lease_time = lease_time;
+		clp->cl_last_renewal = jiffies;
+	}
+	spin_unlock(&clp->cl_lock);
 
+	ret = nfs41_check_session_ready(clp);
+	if (ret)
+		return ret;
+	/* Test for the DS role */
+	if (!is_ds_client(clp))
+		return -ENODEV;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
 
-- 
cgit v0.10.2


From 4697bd5e9419348ef9fa9b55cefe4355ad9d3d01 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 23 May 2012 13:24:36 -0400
Subject: NFSv4: Fix a race in the net namespace mount notification

Since the struct nfs_client gets added to the global nfs_client_list
before it is initialised, it is possible that rpc_pipefs_event can
end up trying to create idmapper entries on such a thing.

The solution is to have the mount notification wait for the
initialisation of each nfs_client to complete, and then to
skip any entries for which the it failed.

Reported-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Stanislav Kinsbursky <skinsbursky@parallels.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 25dde07..d356642 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -507,6 +507,17 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
 	return NULL;
 }
 
+static bool nfs_client_init_is_complete(const struct nfs_client *clp)
+{
+	return clp->cl_cons_state != NFS_CS_INITING;
+}
+
+int nfs_wait_client_init_complete(const struct nfs_client *clp)
+{
+	return wait_event_killable(nfs_client_active_wq,
+			nfs_client_init_is_complete(clp));
+}
+
 /*
  * Found an existing client.  Make sure it's ready before returning.
  */
@@ -516,8 +527,7 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
 {
 	int error;
 
-	error = wait_event_killable(nfs_client_active_wq,
-				clp->cl_cons_state < NFS_CS_INITING);
+	error = nfs_wait_client_init_complete(clp);
 	if (error < 0) {
 		nfs_put_client(clp);
 		return ERR_PTR(-ERESTARTSYS);
@@ -1333,7 +1343,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
 		 * so that the client back channel can find the
 		 * nfs_client struct
 		 */
-		clp->cl_cons_state = NFS_CS_SESSION_INITING;
+		nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING);
 	}
 #endif /* CONFIG_NFS_V4_1 */
 
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 2eaecf9..861be75 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -530,9 +530,24 @@ static struct nfs_client *nfs_get_client_for_event(struct net *net, int event)
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
 	struct dentry *cl_dentry;
 	struct nfs_client *clp;
+	int err;
 
+restart:
 	spin_lock(&nn->nfs_client_lock);
 	list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
+		/* Wait for initialisation to finish */
+		if (clp->cl_cons_state == NFS_CS_INITING) {
+			atomic_inc(&clp->cl_count);
+			spin_unlock(&nn->nfs_client_lock);
+			err = nfs_wait_client_init_complete(clp);
+			nfs_put_client(clp);
+			if (err)
+				return NULL;
+			goto restart;
+		}
+		/* Skip nfs_clients that failed to initialise */
+		if (clp->cl_cons_state < 0)
+			continue;
 		if (clp->rpc_ops != &nfs_v4_clientops)
 			continue;
 		cl_dentry = clp->cl_idmap->idmap_pipe->dentry;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5ea571e..1848a72 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -168,6 +168,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
 					   struct nfs_fh *,
 					   struct nfs_fattr *,
 					   rpc_authflavor_t);
+extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
 extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
 extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 					     const struct sockaddr *ds_addr,
-- 
cgit v0.10.2


From 54ac471c83aff6b1e068eb8029c797dc68a76e89 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 23 May 2012 13:26:10 -0400
Subject: NFS: Add memory barriers to the nfs_client->cl_cons_state
 initialisation

Ensure that a process that uses the nfs_client->cl_cons_state test
for whether the initialisation process is finished does not read
stale data.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d356642..a50bdfb 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -459,6 +459,8 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
 	    clp->cl_cons_state == NFS_CS_SESSION_INITING))
 		return false;
 
+	smp_rmb();
+
 	/* Match the version and minorversion */
 	if (clp->rpc_ops->version != 4 ||
 	    clp->cl_minorversion != minorversion)
@@ -539,6 +541,8 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
 		return ERR_PTR(error);
 	}
 
+	smp_rmb();
+
 	BUG_ON(clp->cl_cons_state != NFS_CS_READY);
 
 	dprintk("<-- %s found nfs_client %p for %s\n",
@@ -597,6 +601,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
  */
 void nfs_mark_client_ready(struct nfs_client *clp, int state)
 {
+	smp_wmb();
 	clp->cl_cons_state = state;
 	wake_up_all(&nfs_client_active_wq);
 }
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 861be75..b5b86a0 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -548,6 +548,7 @@ restart:
 		/* Skip nfs_clients that failed to initialise */
 		if (clp->cl_cons_state < 0)
 			continue;
+		smp_rmb();
 		if (clp->rpc_ops != &nfs_v4_clientops)
 			continue;
 		cl_dentry = clp->cl_idmap->idmap_pipe->dentry;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c856298..8f39bb3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5621,6 +5621,7 @@ static int nfs41_check_session_ready(struct nfs_client *clp)
 	}
 	if (clp->cl_cons_state < NFS_CS_READY)
 		return -EPROTONOSUPPORT;
+	smp_rmb();
 	return 0;
 }
 
-- 
cgit v0.10.2


From 29f772d41c01ad6b72c3de705e79779857badcde Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Thu, 24 May 2012 15:08:58 +0900
Subject: mfd: Fix build break of max77693 by adding REGMAP_I2C option

This patch add REGMAP_I2C config option to fix build break
of max77693 mfd driver because max77693 use regmap interface
for i2c communication.

drivers/mfd/max77693.c:103: error: variable 'max77693_regmap_config' has initializer but incomplete type
drivers/mfd/max77693.c:104: error: unknown field 'reg_bits' specified in initializer
drivers/mfd/max77693.c:104: warning: excess elements in struct initializer
drivers/mfd/max77693.c:104: warning: (near initialization for 'max77693_regmap_config')
drivers/mfd/max77693.c:105: error: unknown field 'val_bits' specified in initializer
drivers/mfd/max77693.c:105: warning: excess elements in struct initializer
drivers/mfd/max77693.c:105: warning: (near initialization for 'max77693_regmap_config')
drivers/mfd/max77693.c:106: error: unknown field 'max_register' specified in initializer
drivers/mfd/max77693.c:106: warning: excess elements in struct initializer
drivers/mfd/max77693.c:106: warning: (near initialization for 'max77693_regmap_config')
drivers/mfd/max77693.c: In function 'max77693_i2c_probe':
drivers/mfd/max77693.c:122: error: implicit declaration of function 'devm_regmap_init_i2c'
drivers/mfd/max77693.c:122: warning: assignment makes pointer from integer without a cast

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Myungjoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 0c3f5de..5b58be4 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -425,6 +425,7 @@ config MFD_MAX77693
 	bool "Maxim Semiconductor MAX77693 PMIC Support"
 	depends on I2C=y && GENERIC_HARDIRQS
 	select MFD_CORE
+	select REGMAP_I2C
 	help
 	  Say yes here to support for Maxim Semiconductor MAX77693.
 	  This is a companion Power Management IC with Flash, Haptic, Charger,
-- 
cgit v0.10.2


From 88034c3d88c2c48b215f2cc5eb22e564aa817f9c Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 23 May 2012 05:02:34 -0400
Subject: NFSv4.1 mdsthreshold attribute xdr

We only support one layout type per file system, so one threshold_item4 per
mdsthreshold4.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5ad2b2c..edb8ac7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -101,9 +101,12 @@ static int nfs4_stat_to_errno(int);
 #define nfs4_path_maxsz		(1 + ((3 + NFS4_MAXPATHLEN) >> 2))
 #define nfs4_owner_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
 #define nfs4_group_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
+/* We support only one layout type per file system */
+#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
 /* This is based on getfattr, which uses the most attributes: */
 #define nfs4_fattr_value_maxsz	(1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
-				3 + 3 + 3 + nfs4_owner_maxsz + nfs4_group_maxsz))
+				3 + 3 + 3 + nfs4_owner_maxsz + \
+				nfs4_group_maxsz + decode_mdsthreshold_maxsz))
 #define nfs4_fattr_maxsz	(nfs4_fattr_bitmap_maxsz + \
 				nfs4_fattr_value_maxsz)
 #define decode_getattr_maxsz    (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -1172,6 +1175,16 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
 			   bitmask[1] & nfs4_fattr_bitmap[1], hdr);
 }
 
+static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
+				 struct compound_hdr *hdr)
+{
+	encode_getattr_three(xdr,
+			     bitmask[0] & nfs4_fattr_bitmap[0],
+			     bitmask[1] & nfs4_fattr_bitmap[1],
+			     bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD,
+			     hdr);
+}
+
 static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
 {
 	encode_getattr_three(xdr,
@@ -2164,7 +2177,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_open(xdr, args, &hdr);
 	encode_getfh(xdr, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_getfattr_open(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -4186,6 +4199,110 @@ xdr_error:
 	return status;
 }
 
+static int decode_threshold_hint(struct xdr_stream *xdr,
+				  uint32_t *bitmap,
+				  uint64_t *res,
+				  uint32_t hint_bit)
+{
+	__be32 *p;
+
+	*res = 0;
+	if (likely(bitmap[0] & hint_bit)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_first_threshold_item4(struct xdr_stream *xdr,
+					struct nfs4_threshold *res)
+{
+	__be32 *p, *savep;
+	uint32_t bitmap[3] = {0,}, attrlen;
+	int status;
+
+	/* layout type */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p)) {
+		print_overflow_msg(__func__, xdr);
+		return -EIO;
+	}
+	res->l_type = be32_to_cpup(p);
+
+	/* thi_hintset bitmap */
+	status = decode_attr_bitmap(xdr, bitmap);
+	if (status < 0)
+		goto xdr_error;
+
+	/* thi_hintlist length */
+	status = decode_attr_length(xdr, &attrlen, &savep);
+	if (status < 0)
+		goto xdr_error;
+	/* thi_hintlist */
+	status = decode_threshold_hint(xdr, bitmap, &res->rd_sz, THRESHOLD_RD);
+	if (status < 0)
+		goto xdr_error;
+	status = decode_threshold_hint(xdr, bitmap, &res->wr_sz, THRESHOLD_WR);
+	if (status < 0)
+		goto xdr_error;
+	status = decode_threshold_hint(xdr, bitmap, &res->rd_io_sz,
+				       THRESHOLD_RD_IO);
+	if (status < 0)
+		goto xdr_error;
+	status = decode_threshold_hint(xdr, bitmap, &res->wr_io_sz,
+				       THRESHOLD_WR_IO);
+	if (status < 0)
+		goto xdr_error;
+
+	status = verify_attr_len(xdr, savep, attrlen);
+	res->bm = bitmap[0];
+
+	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
+		 __func__, res->bm, res->rd_sz, res->wr_sz, res->rd_io_sz,
+		res->wr_io_sz);
+xdr_error:
+	dprintk("%s ret=%d!\n", __func__, status);
+	return status;
+}
+
+/*
+ * Thresholds on pNFS direct I/O vrs MDS I/O
+ */
+static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
+				    uint32_t *bitmap,
+				    struct nfs4_threshold *res)
+{
+	__be32 *p;
+	int status = 0;
+	uint32_t num;
+
+	if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U)))
+		return -EIO;
+	if (likely(bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		num = be32_to_cpup(p);
+		if (num == 0)
+			return 0;
+		if (num > 1)
+			printk(KERN_INFO "%s: Warning: Multiple pNFS layout "
+				"drivers per filesystem not supported\n",
+				__func__);
+
+		status = decode_first_threshold_item4(xdr, res);
+	}
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 		struct nfs_fattr *fattr, struct nfs_fh *fh,
 		struct nfs4_fs_locations *fs_loc,
@@ -4292,6 +4409,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 		goto xdr_error;
 	fattr->valid |= status;
 
+	status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold);
+	if (status < 0)
+		goto xdr_error;
+
 xdr_error:
 	dprintk("%s: xdr returned %d\n", __func__, -status);
 	return status;
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 0987146..72b6bad 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -526,6 +526,13 @@ enum lock_type4 {
 #define FATTR4_WORD1_MOUNTED_ON_FILEID  (1UL << 23)
 #define FATTR4_WORD1_FS_LAYOUT_TYPES    (1UL << 30)
 #define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
+#define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
+
+/* MDS threshold bitmap bits */
+#define THRESHOLD_RD                    (1UL << 0)
+#define THRESHOLD_WR                    (1UL << 1)
+#define THRESHOLD_RD_IO                 (1UL << 2)
+#define THRESHOLD_WR_IO                 (1UL << 3)
 
 #define NFSPROC4_NULL 0
 #define NFSPROC4_COMPOUND 1
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0872f32..201c312 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -35,6 +35,15 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid
 	return a->major == b->major && a->minor == b->minor;
 }
 
+struct nfs4_threshold {
+	__u32	bm;
+	__u32	l_type;
+	__u64	rd_sz;
+	__u64	wr_sz;
+	__u64	rd_io_sz;
+	__u64	wr_io_sz;
+};
+
 struct nfs_fattr {
 	unsigned int		valid;		/* which fields are valid */
 	umode_t			mode;
@@ -67,6 +76,7 @@ struct nfs_fattr {
 	unsigned long		gencount;
 	struct nfs4_string	*owner_name;
 	struct nfs4_string	*group_name;
+	struct nfs4_threshold	*mdsthreshold;	/* pNFS threshold hints */
 };
 
 #define NFS_ATTR_FATTR_TYPE		(1U << 0)
-- 
cgit v0.10.2


From 82be417aa37c05116e310b0f2171187ea389f89b Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 23 May 2012 05:02:35 -0400
Subject: NFSv4.1 cache mdsthreshold values on OPEN

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9ad81ce..889f7e5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -641,6 +641,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f
 	nfs_init_lock_context(&ctx->lock_context);
 	ctx->lock_context.open_context = ctx;
 	INIT_LIST_HEAD(&ctx->list);
+	ctx->mdsthreshold = NULL;
 	return ctx;
 }
 
@@ -669,6 +670,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 		put_rpccred(ctx->cred);
 	dput(ctx->dentry);
 	nfs_sb_deactive(sb);
+	kfree(ctx->mdsthreshold);
 	kfree(ctx);
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8f39bb3..e725736 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1782,7 +1782,14 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
 /*
  * Returns a referenced nfs4_state
  */
-static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
+static int _nfs4_do_open(struct inode *dir,
+			struct dentry *dentry,
+			fmode_t fmode,
+			int flags,
+			struct iattr *sattr,
+			struct rpc_cred *cred,
+			struct nfs4_state **res,
+			struct nfs4_threshold **ctx_th)
 {
 	struct nfs4_state_owner  *sp;
 	struct nfs4_state     *state = NULL;
@@ -1807,6 +1814,11 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode
 	if (opendata == NULL)
 		goto err_put_state_owner;
 
+	if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
+		opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
+		if (!opendata->f_attr.mdsthreshold)
+			goto err_opendata_put;
+	}
 	if (dentry->d_inode != NULL)
 		opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
 
@@ -1832,11 +1844,19 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode
 			nfs_setattr_update_inode(state->inode, sattr);
 		nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
 	}
+
+	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
+		*ctx_th = opendata->f_attr.mdsthreshold;
+	else
+		kfree(opendata->f_attr.mdsthreshold);
+	opendata->f_attr.mdsthreshold = NULL;
+
 	nfs4_opendata_put(opendata);
 	nfs4_put_state_owner(sp);
 	*res = state;
 	return 0;
 err_opendata_put:
+	kfree(opendata->f_attr.mdsthreshold);
 	nfs4_opendata_put(opendata);
 err_put_state_owner:
 	nfs4_put_state_owner(sp);
@@ -1846,14 +1866,21 @@ out_err:
 }
 
 
-static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred)
+static struct nfs4_state *nfs4_do_open(struct inode *dir,
+					struct dentry *dentry,
+					fmode_t fmode,
+					int flags,
+					struct iattr *sattr,
+					struct rpc_cred *cred,
+					struct nfs4_threshold **ctx_th)
 {
 	struct nfs4_exception exception = { };
 	struct nfs4_state *res;
 	int status;
 
 	do {
-		status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, &res);
+		status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
+				       &res, ctx_th);
 		if (status == 0)
 			break;
 		/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2177,7 +2204,8 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags
 	struct nfs4_state *state;
 
 	/* Protect against concurrent sillydeletes */
-	state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr, ctx->cred);
+	state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr,
+			     ctx->cred, &ctx->mdsthreshold);
 	if (IS_ERR(state))
 		return ERR_CAST(state);
 	ctx->state = state;
@@ -2779,7 +2807,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		fmode = ctx->mode;
 	}
 	sattr->ia_mode &= ~current_umask();
-	state = nfs4_do_open(dir, de, fmode, flags, sattr, cred);
+	state = nfs4_do_open(dir, de, fmode, flags, sattr, cred, NULL);
 	d_drop(dentry);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 5d09a36..cbcb6ae 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1630,3 +1630,15 @@ out_free:
 	kfree(data);
 	goto out;
 }
+
+struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
+{
+	struct nfs4_threshold *thp;
+
+	thp = kzalloc(sizeof(*thp), GFP_NOFS);
+	if (!thp) {
+		dprintk("%s mdsthreshold allocation failed\n", __func__);
+		return NULL;
+	}
+	return thp;
+}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 7980756..29fd23c 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -227,6 +227,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
 			const struct nfs_pgio_completion_ops *compl_ops);
 int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
 			const struct nfs_pgio_completion_ops *compl_ops);
+struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 
 /* nfs4_deviceid_flags */
 enum {
@@ -360,6 +361,14 @@ static inline int pnfs_return_layout(struct inode *ino)
 	return 0;
 }
 
+static inline bool
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
+		   struct nfs_server *nfss)
+{
+	return (dst && src && src->bm != 0 &&
+					nfss->pnfs_curr_ld->id == src->l_type);
+}
+
 #ifdef NFS_DEBUG
 void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
 #else
@@ -485,6 +494,18 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	return 0;
 }
 
+static inline bool
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
+		   struct nfs_server *nfss)
+{
+	return false;
+}
+
+static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
+{
+	return NULL;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* FS_NFS_PNFS_H */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 80a9385..ce910cb 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -102,6 +102,7 @@ struct nfs_open_context {
 	int error;
 
 	struct list_head list;
+	struct nfs4_threshold	*mdsthreshold;
 };
 
 struct nfs_open_dir_context {
-- 
cgit v0.10.2


From 2701d086dbfca03b2d28b25c6dc11dd78d0e26ad Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Thu, 24 May 2012 13:13:24 -0400
Subject: NFSv4.1 add nfs_inode book keeping for mdsthreshold

Keep track of the number of bytes read or written via buffered, direct, and
mem-mapped i/o for use by mdsthreshold size_io hints.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index c47a46e..23d170b 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -447,6 +447,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
 		result = nfs_direct_wait(dreq);
+	NFS_I(inode)->read_io += result;
 out_release:
 	nfs_direct_req_release(dreq);
 out:
@@ -785,6 +786,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 		pos += vec->iov_len;
 	}
 	nfs_pageio_complete(&desc);
+	NFS_I(dreq->inode)->write_io += desc.pg_bytes_written;
 
 	/*
 	 * If no bytes were started, return the error, and let the
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8eda8a6..56311ca 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -424,6 +424,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 
 	if (status < 0)
 		return status;
+	NFS_I(mapping->host)->write_io += copied;
 	return copied;
 }
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 889f7e5..a6f5fbb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -323,6 +323,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		inode->i_gid = -2;
 		inode->i_blocks = 0;
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+		nfsi->write_io = 0;
+		nfsi->read_io = 0;
 
 		nfsi->read_cache_jiffies = fattr->time_start;
 		nfsi->attr_gencount = fattr->gencount;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index cbcb6ae..6620606 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -395,6 +395,9 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 	dprintk("%s:Begin lo %p\n", __func__, lo);
 
 	if (list_empty(&lo->plh_segs)) {
+		/* Reset MDS Threshold I/O counters */
+		NFS_I(lo->plh_inode)->write_io = 0;
+		NFS_I(lo->plh_inode)->read_io = 0;
 		if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
 			put_layout_hdr_locked(lo);
 		return 0;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2cfdd77..86ced78 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -152,6 +152,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	nfs_pageio_init_read(&pgio, inode, &nfs_async_read_completion_ops);
 	nfs_pageio_add_request(&pgio, new);
 	nfs_pageio_complete(&pgio);
+	NFS_I(inode)->read_io += pgio.pg_bytes_written;
 	return 0;
 }
 
@@ -656,6 +657,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
 
 	nfs_pageio_complete(&pgio);
+	NFS_I(inode)->read_io += pgio.pg_bytes_written;
 	npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
 read_complete:
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index ce910cb..b23cfc1 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -202,6 +202,9 @@ struct nfs_inode {
 	/* pNFS layout information */
 	struct pnfs_layout_hdr *layout;
 #endif /* CONFIG_NFS_V4*/
+	/* how many bytes have been written/read and how many bytes queued up */
+	__u64 write_io;
+	__u64 read_io;
 #ifdef CONFIG_NFS_FSCACHE
 	struct fscache_cookie	*fscache;
 #endif
-- 
cgit v0.10.2


From d23d61c8d351f5ced44ce87caf1fa3baab4c3f89 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 23 May 2012 05:02:37 -0400
Subject: NFSv4.1 test the mdsthreshold hint parameters

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6620606..b8323aa 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -936,6 +936,81 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
 }
 
 /*
+ * Use mdsthreshold hints set at each OPEN to determine if I/O should go
+ * to the MDS or over pNFS
+ *
+ * The nfs_inode read_io and write_io fields are cumulative counters reset
+ * when there are no layout segments. Note that in pnfs_update_layout iomode
+ * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
+ * WRITE request.
+ *
+ * A return of true means use MDS I/O.
+ *
+ * From rfc 5661:
+ * If a file's size is smaller than the file size threshold, data accesses
+ * SHOULD be sent to the metadata server.  If an I/O request has a length that
+ * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
+ * server.  If both file size and I/O size are provided, the client SHOULD
+ * reach or exceed  both thresholds before sending its read or write
+ * requests to the data server.
+ */
+static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
+				     struct inode *ino, int iomode)
+{
+	struct nfs4_threshold *t = ctx->mdsthreshold;
+	struct nfs_inode *nfsi = NFS_I(ino);
+	loff_t fsize = i_size_read(ino);
+	bool size = false, size_set = false, io = false, io_set = false, ret = false;
+
+	if (t == NULL)
+		return ret;
+
+	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
+		__func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
+
+	switch (iomode) {
+	case IOMODE_READ:
+		if (t->bm & THRESHOLD_RD) {
+			dprintk("%s fsize %llu\n", __func__, fsize);
+			size_set = true;
+			if (fsize < t->rd_sz)
+				size = true;
+		}
+		if (t->bm & THRESHOLD_RD_IO) {
+			dprintk("%s nfsi->read_io %llu\n", __func__,
+				nfsi->read_io);
+			io_set = true;
+			if (nfsi->read_io < t->rd_io_sz)
+				io = true;
+		}
+		break;
+	case IOMODE_RW:
+		if (t->bm & THRESHOLD_WR) {
+			dprintk("%s fsize %llu\n", __func__, fsize);
+			size_set = true;
+			if (fsize < t->wr_sz)
+				size = true;
+		}
+		if (t->bm & THRESHOLD_WR_IO) {
+			dprintk("%s nfsi->write_io %llu\n", __func__,
+				nfsi->write_io);
+			io_set = true;
+			if (nfsi->write_io < t->wr_io_sz)
+				io = true;
+		}
+		break;
+	}
+	if (size_set && io_set) {
+		if (size && io)
+			ret = true;
+	} else if (size || io)
+		ret = true;
+
+	dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
+	return ret;
+}
+
+/*
  * Layout segment is retreived from the server if not cached.
  * The appropriate layout segment is referenced and returned to the caller.
  */
@@ -962,6 +1037,10 @@ pnfs_update_layout(struct inode *ino,
 
 	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
 		return NULL;
+
+	if (pnfs_within_mdsthreshold(ctx, ino, iomode))
+		return NULL;
+
 	spin_lock(&ino->i_lock);
 	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
 	if (lo == NULL) {
-- 
cgit v0.10.2


From 7c44f1ae4a21458a1ea3d6482ffb3136f1df6d2b Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Thu, 24 May 2012 13:22:50 -0400
Subject: nfs4.1: add BIND_CONN_TO_SESSION operation

This patch adds the BIND_CONN_TO_SESSION operation which is needed for
upcoming SP4_MACH_CRED work and useful for recovering from broken connections
without destroying the session.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index e6da021..2c7f1cf 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -211,6 +211,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
 extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
+extern int nfs4_proc_bind_conn_to_session(struct nfs_client *);
 extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e725736..e8988c0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5100,6 +5100,60 @@ nfs41_same_server_scope(struct nfs41_server_scope *a,
 }
 
 /*
+ * nfs4_proc_bind_conn_to_session()
+ *
+ * The 4.1 client currently uses the same TCP connection for the
+ * fore and backchannel.
+ */
+int nfs4_proc_bind_conn_to_session(struct nfs_client *clp)
+{
+	int status;
+	struct nfs41_bind_conn_to_session_res res;
+	struct rpc_message msg = {
+		.rpc_proc =
+			&nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION],
+		.rpc_argp = clp,
+		.rpc_resp = &res,
+	};
+
+	dprintk("--> %s\n", __func__);
+	BUG_ON(clp == NULL);
+
+	res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
+	if (unlikely(res.session == NULL)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	if (status == 0) {
+		if (memcmp(res.session->sess_id.data,
+		    clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
+			dprintk("NFS: %s: Session ID mismatch\n", __func__);
+			status = -EIO;
+			goto out_session;
+		}
+		if (res.dir != NFS4_CDFS4_BOTH) {
+			dprintk("NFS: %s: Unexpected direction from server\n",
+				__func__);
+			status = -EIO;
+			goto out_session;
+		}
+		if (res.use_conn_in_rdma_mode) {
+			dprintk("NFS: %s: Server returned RDMA mode = true\n",
+				__func__);
+			status = -EIO;
+			goto out_session;
+		}
+	}
+out_session:
+	kfree(res.session);
+out:
+	dprintk("<-- %s status= %d\n", __func__, status);
+	return status;
+}
+
+/*
  * nfs4_proc_exchange_id()
  *
  * Since the clientid has expired, all compounds using sessions
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index edb8ac7..a6b95b7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -326,6 +326,16 @@ static int nfs4_stat_to_errno(int);
 				     1 /* csr_flags */ + \
 				     decode_channel_attrs_maxsz + \
 				     decode_channel_attrs_maxsz)
+#define encode_bind_conn_to_session_maxsz  (op_encode_hdr_maxsz + \
+				     /* bctsa_sessid */ \
+				     XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+				     1 /* bctsa_dir */ + \
+				     1 /* bctsa_use_conn_in_rdma_mode */)
+#define decode_bind_conn_to_session_maxsz  (op_decode_hdr_maxsz +	\
+				     /* bctsr_sessid */ \
+				     XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+				     1 /* bctsr_dir */ + \
+				     1 /* bctsr_use_conn_in_rdma_mode */)
 #define encode_destroy_session_maxsz    (op_encode_hdr_maxsz + 4)
 #define decode_destroy_session_maxsz    (op_decode_hdr_maxsz)
 #define encode_sequence_maxsz	(op_encode_hdr_maxsz + \
@@ -719,6 +729,12 @@ static int nfs4_stat_to_errno(int);
 				decode_putfh_maxsz + \
 				decode_secinfo_maxsz)
 #if defined(CONFIG_NFS_V4_1)
+#define NFS4_enc_bind_conn_to_session_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_bind_conn_to_session_maxsz)
+#define NFS4_dec_bind_conn_to_session_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_bind_conn_to_session_maxsz)
 #define NFS4_enc_exchange_id_sz \
 				(compound_encode_hdr_maxsz + \
 				 encode_exchange_id_maxsz)
@@ -1669,6 +1685,20 @@ static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, stru
 
 #if defined(CONFIG_NFS_V4_1)
 /* NFSv4.1 operations */
+static void encode_bind_conn_to_session(struct xdr_stream *xdr,
+				   struct nfs4_session *session,
+				   struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION,
+		decode_bind_conn_to_session_maxsz, hdr);
+	encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	p = xdr_reserve_space(xdr, 8);
+	*p++ = cpu_to_be32(NFS4_CDFC4_BACK_OR_BOTH);
+	*p = 0;	/* use_conn_in_rdma_mode = False */
+}
+
 static void encode_exchange_id(struct xdr_stream *xdr,
 			       struct nfs41_exchange_id_args *args,
 			       struct compound_hdr *hdr)
@@ -2630,6 +2660,22 @@ static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,
 
 #if defined(CONFIG_NFS_V4_1)
 /*
+ * BIND_CONN_TO_SESSION request
+ */
+static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req,
+				struct xdr_stream *xdr,
+				struct nfs_client *clp)
+{
+	struct compound_hdr hdr = {
+		.minorversion = clp->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_bind_conn_to_session(xdr, clp->cl_session, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
  * EXCHANGE_ID request
  */
 static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
@@ -5366,6 +5412,37 @@ static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
 	return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);
 }
 
+static int decode_bind_conn_to_session(struct xdr_stream *xdr,
+				struct nfs41_bind_conn_to_session_res *res)
+{
+	__be32 *p;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION);
+	if (!status)
+		status = decode_sessionid(xdr, &res->session->sess_id);
+	if (unlikely(status))
+		return status;
+
+	/* dir flags, rdma mode bool */
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+
+	res->dir = be32_to_cpup(p++);
+	if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH)
+		return -EIO;
+	if (be32_to_cpup(p) == 0)
+		res->use_conn_in_rdma_mode = false;
+	else
+		res->use_conn_in_rdma_mode = true;
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 static int decode_create_session(struct xdr_stream *xdr,
 				 struct nfs41_create_session_res *res)
 {
@@ -6648,6 +6725,22 @@ out:
 
 #if defined(CONFIG_NFS_V4_1)
 /*
+ * Decode BIND_CONN_TO_SESSION response
+ */
+static int nfs4_xdr_dec_bind_conn_to_session(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_bind_conn_to_session(xdr, res);
+	return status;
+}
+
+/*
  * Decode EXCHANGE_ID response
  */
 static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
@@ -7128,6 +7221,8 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(RELEASE_LOCKOWNER,	enc_release_lockowner,	dec_release_lockowner),
 	PROC(SECINFO,		enc_secinfo,		dec_secinfo),
 #if defined(CONFIG_NFS_V4_1)
+	PROC(BIND_CONN_TO_SESSION,
+			enc_bind_conn_to_session, dec_bind_conn_to_session),
 	PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
 	PROC(CREATE_SESSION,	enc_create_session,	dec_create_session),
 	PROC(DESTROY_SESSION,	enc_destroy_session,	dec_destroy_session),
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 72b6bad..a2b71cb 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -69,6 +69,10 @@
 #define NFS4_CDFC4_FORE_OR_BOTH 0x3
 #define NFS4_CDFC4_BACK_OR_BOTH 0x7
 
+#define NFS4_CDFS4_FORE 0x1
+#define NFS4_CDFS4_BACK 0x2
+#define NFS4_CDFS4_BOTH 0x3
+
 #define NFS4_SET_TO_SERVER_TIME	0
 #define NFS4_SET_TO_CLIENT_TIME	1
 
@@ -589,6 +593,7 @@ enum {
 	NFSPROC4_CLNT_SECINFO,
 
 	/* nfs41 */
+	NFSPROC4_CLNT_BIND_CONN_TO_SESSION,
 	NFSPROC4_CLNT_EXCHANGE_ID,
 	NFSPROC4_CLNT_CREATE_SESSION,
 	NFSPROC4_CLNT_DESTROY_SESSION,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 201c312..6387fc0 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1125,6 +1125,12 @@ struct nfs41_impl_id {
 	struct nfstime4			date;
 };
 
+struct nfs41_bind_conn_to_session_res {
+	struct nfs4_session		*session;
+	u32				dir;
+	bool				use_conn_in_rdma_mode;
+};
+
 struct nfs41_exchange_id_res {
 	struct nfs_client		*client;
 	u32				flags;
-- 
cgit v0.10.2


From a9e64442f1399e9f6ceaeeeb03a26a560c949fac Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Thu, 24 May 2012 12:26:37 -0400
Subject: nfs41: Use BIND_CONN_TO_SESSION for CB_PATH_DOWN*

The state manager can handle SEQ4_STATUS_CB_PATH_DOWN* flags with a
BIND_CONN_TO_SESSION instead of destroying the session and creating a new one.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 2c7f1cf..5fcb1ad 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -25,6 +25,7 @@ enum nfs4_client_state {
 	NFS4CLNT_LEASE_CONFIRM,
 	NFS4CLNT_SERVER_SCOPE_MISMATCH,
 	NFS4CLNT_PURGE_STATE,
+	NFS4CLNT_BIND_CONN_TO_SESSION,
 };
 
 enum nfs4_session_state {
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 32cce4a..03fa802 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1640,13 +1640,20 @@ static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
 	nfs_expire_all_delegations(clp);
 }
 
-static void nfs41_handle_cb_path_down(struct nfs_client *clp)
+static void nfs41_handle_backchannel_fault(struct nfs_client *clp)
 {
 	nfs_expire_all_delegations(clp);
 	if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
 		nfs4_schedule_state_manager(clp);
 }
 
+static void nfs41_handle_cb_path_down(struct nfs_client *clp)
+{
+	if (test_and_set_bit(NFS4CLNT_BIND_CONN_TO_SESSION,
+		&clp->cl_state) == 0)
+		nfs4_schedule_state_manager(clp);
+}
+
 void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 {
 	if (!flags)
@@ -1664,9 +1671,10 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 		nfs41_handle_state_revoked(clp);
 	if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
 		nfs41_handle_recallable_state_revoked(clp);
-	if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
-			    SEQ4_STATUS_BACKCHANNEL_FAULT |
-			    SEQ4_STATUS_CB_PATH_DOWN_SESSION))
+	if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT)
+		nfs41_handle_backchannel_fault(clp);
+	else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
+				SEQ4_STATUS_CB_PATH_DOWN_SESSION))
 		nfs41_handle_cb_path_down(clp);
 }
 
@@ -1691,6 +1699,7 @@ static int nfs4_reset_session(struct nfs_client *clp)
 	clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
 	/* create_session negotiated new slot table */
 	clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
+	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
 
 	 /* Let the state manager reestablish state */
 	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
@@ -1727,10 +1736,19 @@ static int nfs4_recall_slot(struct nfs_client *clp)
 	return 0;
 }
 
+static int nfs4_bind_conn_to_session(struct nfs_client *clp)
+{
+	return nfs4_proc_bind_conn_to_session(clp);
+}
 #else /* CONFIG_NFS_V4_1 */
 static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
 static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
 static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
+
+static int nfs4_bind_conn_to_session(struct nfs_client *clp)
+{
+	return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 /* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
@@ -1814,6 +1832,14 @@ static void nfs4_state_manager(struct nfs_client *clp)
 				goto out_error;
 		}
 
+		/* Send BIND_CONN_TO_SESSION */
+		if (test_and_clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION,
+				&clp->cl_state) && nfs4_has_session(clp)) {
+			status = nfs4_bind_conn_to_session(clp);
+			if (status < 0)
+				goto out_error;
+		}
+
 		/* First recover reboot state... */
 		if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
 			status = nfs4_do_reclaim(clp,
-- 
cgit v0.10.2


From bbafffd293e47f4cd5f0ae8b91d7d5767b242a5e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 24 May 2012 16:31:39 -0400
Subject: NFSv4.1: Exchange ID must use GFP_NOFS allocation mode

Exchange ID can be called in a lease reclaim situation, so it
will deadlock if it then tries to write out dirty NFS pages.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e8988c0..f8817e8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5192,20 +5192,20 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 				clp->cl_rpcclient->cl_auth->au_flavor);
 
 	res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
-					GFP_KERNEL);
+					GFP_NOFS);
 	if (unlikely(res.server_owner == NULL)) {
 		status = -ENOMEM;
 		goto out;
 	}
 
 	res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
-					GFP_KERNEL);
+					GFP_NOFS);
 	if (unlikely(res.server_scope == NULL)) {
 		status = -ENOMEM;
 		goto out_server_owner;
 	}
 
-	res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_KERNEL);
+	res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
 	if (unlikely(res.impl_id == NULL)) {
 		status = -ENOMEM;
 		goto out_server_scope;
-- 
cgit v0.10.2


From 2a6ee6aa2f6dfc47fce8380ec9e31601c96a693e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 25 May 2012 15:00:06 -0400
Subject: NFSv4: Clean up the error handling for nfs4_reclaim_lease

Try to consolidate the error handling for nfs4_reclaim_lease into
a single function instead of doing a bit here, and a bit there...

Also ensure that NFS4CLNT_PURGE_STATE handles errors correctly.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 03fa802..758b9a8 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1574,26 +1574,57 @@ out:
 	return nfs4_recovery_handle_error(clp, status);
 }
 
+/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
+ * on EXCHANGE_ID for v4.1
+ */
+static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
+{
+	switch (status) {
+	case -NFS4ERR_CLID_INUSE:
+	case -NFS4ERR_STALE_CLIENTID:
+		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+		break;
+	case -EACCES:
+		if (clp->cl_machine_cred == NULL)
+			return -EACCES;
+		/* Handle case where the user hasn't set up machine creds */
+		nfs4_clear_machine_cred(clp);
+	case -NFS4ERR_DELAY:
+	case -ETIMEDOUT:
+	case -EAGAIN:
+		ssleep(1);
+		break;
+
+	case -NFS4ERR_MINOR_VERS_MISMATCH:
+		if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
+			nfs_mark_client_ready(clp, -EPROTONOSUPPORT);
+		return -EPROTONOSUPPORT;
+	case -EKEYEXPIRED:
+		nfs4_warn_keyexpired(clp->cl_hostname);
+	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
+				 * in nfs4_exchange_id */
+	default:
+		return status;
+	}
+	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+	return 0;
+}
+
 static int nfs4_reclaim_lease(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
 	const struct nfs4_state_recovery_ops *ops =
 		clp->cl_mvops->reboot_recovery_ops;
-	int status = -ENOENT;
+	int status;
 
 	cred = ops->get_clid_cred(clp);
-	if (cred != NULL) {
-		status = ops->establish_clid(clp, cred);
-		put_rpccred(cred);
-		/* Handle case where the user hasn't set up machine creds */
-		if (status == -EACCES && cred == clp->cl_machine_cred) {
-			nfs4_clear_machine_cred(clp);
-			status = -EAGAIN;
-		}
-		if (status == -NFS4ERR_MINOR_VERS_MISMATCH)
-			status = -EPROTONOSUPPORT;
-	}
-	return status;
+	if (cred == NULL)
+		return -ENOENT;
+	status = ops->establish_clid(clp, cred);
+	put_rpccred(cred);
+	if (status != 0)
+		return nfs4_handle_reclaim_lease_error(clp, status);
+	return 0;
 }
 
 #ifdef CONFIG_NFS_V4_1
@@ -1751,32 +1782,6 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
 }
 #endif /* CONFIG_NFS_V4_1 */
 
-/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
- * on EXCHANGE_ID for v4.1
- */
-static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
-{
-	switch (status) {
-	case -NFS4ERR_CLID_INUSE:
-	case -NFS4ERR_STALE_CLIENTID:
-		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
-		break;
-	case -NFS4ERR_DELAY:
-	case -ETIMEDOUT:
-	case -EAGAIN:
-		ssleep(1);
-		break;
-
-	case -EKEYEXPIRED:
-		nfs4_warn_keyexpired(clp->cl_hostname);
-	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
-				 * in nfs4_exchange_id */
-	default:
-		return;
-	}
-	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-}
-
 static void nfs4_state_manager(struct nfs_client *clp)
 {
 	int status = 0;
@@ -1784,7 +1789,9 @@ static void nfs4_state_manager(struct nfs_client *clp)
 	/* Ensure exclusive access to NFSv4 state */
 	do {
 		if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
-			nfs4_reclaim_lease(clp);
+			status = nfs4_reclaim_lease(clp);
+			if (status < 0)
+				goto out_error;
 			clear_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
 			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 		}
@@ -1792,16 +1799,10 @@ static void nfs4_state_manager(struct nfs_client *clp)
 		if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
 			/* We're going to have to re-establish a clientid */
 			status = nfs4_reclaim_lease(clp);
-			if (status) {
-				nfs4_set_lease_expired(clp, status);
-				if (test_bit(NFS4CLNT_LEASE_EXPIRED,
-							&clp->cl_state))
-					continue;
-				if (clp->cl_cons_state ==
-							NFS_CS_SESSION_INITING)
-					nfs_mark_client_ready(clp, status);
+			if (status < 0)
 				goto out_error;
-			}
+			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+				continue;
 			clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
 
 			if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH,
-- 
cgit v0.10.2


From be0bfed002e0c64a91dacc42a4dab6e883e6bc7e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 25 May 2012 16:02:15 -0400
Subject: NFSv4: When purging the lease, we must clear NFS4CLNT_LEASE_CONFIRM

Otherwise we can end up not sending a new exchange-id/setclientid

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 758b9a8..604c600 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1647,6 +1647,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp)
 {
 	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
 		set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
 		nfs4_state_start_reclaim_nograce(clp);
 		nfs4_schedule_state_manager(clp);
 	}
-- 
cgit v0.10.2


From 89a217360ef4e96eb83758cb9647f1c42581b097 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 25 May 2012 15:00:06 -0400
Subject: NFSv4.1: Handle NFS4ERR_SEQ_MISORDERED when confirming the lease

Apparently the patch "NFS: Always use the same SETCLIENTID boot verifier"
is tickling a Linux nfs server bug, and causing a regression: the server
can get into a situation where it keeps replying NFS4ERR_SEQ_MISORDERED
to our CREATE_SESSION request even when we are sending the correct
sequence ID.

Fix this by purging the lease and then retrying.

Reported-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 604c600..419f8c4 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1580,6 +1580,11 @@ out:
 static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
 {
 	switch (status) {
+	case -NFS4ERR_SEQ_MISORDERED:
+		if (test_and_set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state))
+			return -ESERVERFAULT;
+		/* Lease confirmation error: retry after purging the lease */
+		ssleep(1);
 	case -NFS4ERR_CLID_INUSE:
 	case -NFS4ERR_STALE_CLIENTID:
 		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
-- 
cgit v0.10.2


From ad24ecfbcddfa88541bccc980e753aeda8bf4031 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 25 May 2012 17:11:42 -0400
Subject: NFSv4.1: Move NFSPROC4_CLNT_BIND_CONN_TO_SESSION to the end of the
 operations

For backward compatibility with nfs-utils.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Weston Andros Adamson <dros@netapp.com>

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index a6b95b7..1d4d259 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7221,8 +7221,6 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(RELEASE_LOCKOWNER,	enc_release_lockowner,	dec_release_lockowner),
 	PROC(SECINFO,		enc_secinfo,		dec_secinfo),
 #if defined(CONFIG_NFS_V4_1)
-	PROC(BIND_CONN_TO_SESSION,
-			enc_bind_conn_to_session, dec_bind_conn_to_session),
 	PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
 	PROC(CREATE_SESSION,	enc_create_session,	dec_create_session),
 	PROC(DESTROY_SESSION,	enc_destroy_session,	dec_destroy_session),
@@ -7237,6 +7235,8 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(TEST_STATEID,	enc_test_stateid,	dec_test_stateid),
 	PROC(FREE_STATEID,	enc_free_stateid,	dec_free_stateid),
 	PROC(GETDEVICELIST,	enc_getdevicelist,	dec_getdevicelist),
+	PROC(BIND_CONN_TO_SESSION,
+			enc_bind_conn_to_session, dec_bind_conn_to_session),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index a2b71cb..54006a9 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -593,7 +593,6 @@ enum {
 	NFSPROC4_CLNT_SECINFO,
 
 	/* nfs41 */
-	NFSPROC4_CLNT_BIND_CONN_TO_SESSION,
 	NFSPROC4_CLNT_EXCHANGE_ID,
 	NFSPROC4_CLNT_CREATE_SESSION,
 	NFSPROC4_CLNT_DESTROY_SESSION,
@@ -608,6 +607,7 @@ enum {
 	NFSPROC4_CLNT_TEST_STATEID,
 	NFSPROC4_CLNT_FREE_STATEID,
 	NFSPROC4_CLNT_GETDEVICELIST,
+	NFSPROC4_CLNT_BIND_CONN_TO_SESSION,
 };
 
 /* nfs41 types */
-- 
cgit v0.10.2


From 848f5bda54ef19435ff78f124082bf6eff2ab620 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 25 May 2012 17:51:23 -0400
Subject: NFSv4.1: Ensure we use the correct credentials for session
 create/destroy

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 5fcb1ad..a5dbe62 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -241,8 +241,8 @@ extern int nfs41_setup_sequence(struct nfs4_session *session,
 		struct rpc_task *task);
 extern void nfs4_destroy_session(struct nfs4_session *session);
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
-extern int nfs4_proc_create_session(struct nfs_client *);
-extern int nfs4_proc_destroy_session(struct nfs4_session *);
+extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);
 extern int nfs4_init_session(struct nfs_server *server);
 extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
 		struct nfs_fsinfo *fsinfo);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f8817e8..8fa3a36 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5480,8 +5480,12 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 void nfs4_destroy_session(struct nfs4_session *session)
 {
 	struct rpc_xprt *xprt;
+	struct rpc_cred *cred;
 
-	nfs4_proc_destroy_session(session);
+	cred = nfs4_get_exchange_id_cred(session->clp);
+	nfs4_proc_destroy_session(session, cred);
+	if (cred)
+		put_rpccred(cred);
 
 	rcu_read_lock();
 	xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
@@ -5591,7 +5595,8 @@ static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
 	return nfs4_verify_back_channel_attrs(args, session);
 }
 
-static int _nfs4_proc_create_session(struct nfs_client *clp)
+static int _nfs4_proc_create_session(struct nfs_client *clp,
+		struct rpc_cred *cred)
 {
 	struct nfs4_session *session = clp->cl_session;
 	struct nfs41_create_session_args args = {
@@ -5605,6 +5610,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp)
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
 		.rpc_argp = &args,
 		.rpc_resp = &res,
+		.rpc_cred = cred,
 	};
 	int status;
 
@@ -5629,7 +5635,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp)
  * It is the responsibility of the caller to verify the session is
  * expired before calling this routine.
  */
-int nfs4_proc_create_session(struct nfs_client *clp)
+int nfs4_proc_create_session(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	int status;
 	unsigned *ptr;
@@ -5637,7 +5643,7 @@ int nfs4_proc_create_session(struct nfs_client *clp)
 
 	dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
 
-	status = _nfs4_proc_create_session(clp);
+	status = _nfs4_proc_create_session(clp, cred);
 	if (status)
 		goto out;
 
@@ -5659,10 +5665,15 @@ out:
  * Issue the over-the-wire RPC DESTROY_SESSION.
  * The caller must serialize access to this routine.
  */
-int nfs4_proc_destroy_session(struct nfs4_session *session)
+int nfs4_proc_destroy_session(struct nfs4_session *session,
+		struct rpc_cred *cred)
 {
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION],
+		.rpc_argp = session,
+		.rpc_cred = cred,
+	};
 	int status = 0;
-	struct rpc_message msg;
 
 	dprintk("--> nfs4_proc_destroy_session\n");
 
@@ -5670,10 +5681,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
 	if (session->clp->cl_cons_state != NFS_CS_READY)
 		return status;
 
-	msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION];
-	msg.rpc_argp = session;
-	msg.rpc_resp = NULL;
-	msg.rpc_cred = NULL;
 	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 
 	if (status)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 419f8c4..1587840 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -256,7 +256,7 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 		goto out;
 	set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
 do_confirm:
-	status = nfs4_proc_create_session(clp);
+	status = nfs4_proc_create_session(clp, cred);
 	if (status != 0)
 		goto out;
 	clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
@@ -1717,10 +1717,12 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 
 static int nfs4_reset_session(struct nfs_client *clp)
 {
+	struct rpc_cred *cred;
 	int status;
 
 	nfs4_begin_drain_session(clp);
-	status = nfs4_proc_destroy_session(clp->cl_session);
+	cred = nfs4_get_exchange_id_cred(clp);
+	status = nfs4_proc_destroy_session(clp->cl_session, cred);
 	if (status && status != -NFS4ERR_BADSESSION &&
 	    status != -NFS4ERR_DEADSESSION) {
 		status = nfs4_recovery_handle_error(clp, status);
@@ -1728,7 +1730,7 @@ static int nfs4_reset_session(struct nfs_client *clp)
 	}
 
 	memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
-	status = nfs4_proc_create_session(clp);
+	status = nfs4_proc_create_session(clp, cred);
 	if (status) {
 		status = nfs4_recovery_handle_error(clp, status);
 		goto out;
@@ -1742,6 +1744,8 @@ static int nfs4_reset_session(struct nfs_client *clp)
 	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
 		nfs41_setup_state_renewal(clp);
 out:
+	if (cred)
+		put_rpccred(cred);
 	return status;
 }
 
-- 
cgit v0.10.2


From 2cf047c994c8a62f65e520342d0287fca8807a53 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 25 May 2012 17:57:41 -0400
Subject: NFSv4.1: Ensure we use the correct credentials for
 bind_conn_to_session

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Weston Andros Adamson <dros@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a5dbe62..f730730 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -212,7 +212,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
 extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-extern int nfs4_proc_bind_conn_to_session(struct nfs_client *);
+extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred);
 extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8fa3a36..3fdff0c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5105,7 +5105,7 @@ nfs41_same_server_scope(struct nfs41_server_scope *a,
  * The 4.1 client currently uses the same TCP connection for the
  * fore and backchannel.
  */
-int nfs4_proc_bind_conn_to_session(struct nfs_client *clp)
+int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	int status;
 	struct nfs41_bind_conn_to_session_res res;
@@ -5114,6 +5114,7 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp)
 			&nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION],
 		.rpc_argp = clp,
 		.rpc_resp = &res,
+		.rpc_cred = cred,
 	};
 
 	dprintk("--> %s\n", __func__);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1587840..7dbca66 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1779,7 +1779,14 @@ static int nfs4_recall_slot(struct nfs_client *clp)
 
 static int nfs4_bind_conn_to_session(struct nfs_client *clp)
 {
-	return nfs4_proc_bind_conn_to_session(clp);
+	struct rpc_cred *cred;
+	int ret;
+
+	cred = nfs4_get_exchange_id_cred(clp);
+	ret = nfs4_proc_bind_conn_to_session(clp, cred);
+	if (cred)
+		put_rpccred(cred);
+	return ret;
 }
 #else /* CONFIG_NFS_V4_1 */
 static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
-- 
cgit v0.10.2


From 662455391040a783b89d0232e743c27c23617dbd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 25 May 2012 17:18:09 -0400
Subject: NFSv4.1: Add DESTROY_CLIENTID

Ensure that we destroy our lease on last unmount

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a50bdfb..7d10875 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -209,6 +209,7 @@ static void nfs4_shutdown_session(struct nfs_client *clp)
 	if (nfs4_has_session(clp)) {
 		nfs4_deviceid_purge_client(clp);
 		nfs4_destroy_session(clp->cl_session);
+		nfs4_destroy_clientid(clp);
 	}
 
 }
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index f730730..b20b516 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -214,6 +214,7 @@ extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setcli
 extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred);
 extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
+extern int nfs4_destroy_clientid(struct nfs_client *clp);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3fdff0c..485a6c0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5261,6 +5261,65 @@ out:
 	return status;
 }
 
+static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
+		struct rpc_cred *cred)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_CLIENTID],
+		.rpc_argp = clp,
+		.rpc_cred = cred,
+	};
+	int status;
+
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	if (status)
+		pr_warn("NFS: Got error %d from the server %s on "
+			"DESTROY_CLIENTID.", status, clp->cl_hostname);
+	return status;
+}
+
+static int nfs4_proc_destroy_clientid(struct nfs_client *clp,
+		struct rpc_cred *cred)
+{
+	unsigned int loop;
+	int ret;
+
+	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
+		ret = _nfs4_proc_destroy_clientid(clp, cred);
+		switch (ret) {
+		case -NFS4ERR_DELAY:
+		case -NFS4ERR_CLIENTID_BUSY:
+			ssleep(1);
+			break;
+		default:
+			return ret;
+		}
+	}
+	return 0;
+}
+
+int nfs4_destroy_clientid(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+	int ret = 0;
+
+	if (clp->cl_mvops->minor_version < 1)
+		goto out;
+	if (clp->cl_exchange_flags == 0)
+		goto out;
+	cred = nfs4_get_exchange_id_cred(clp);
+	ret = nfs4_proc_destroy_clientid(clp, cred);
+	if (cred)
+		put_rpccred(cred);
+	switch (ret) {
+	case 0:
+	case -NFS4ERR_STALE_CLIENTID:
+		clp->cl_exchange_flags = 0;
+	}
+out:
+	return ret;
+}
+
 struct nfs4_get_lease_time_data {
 	struct nfs4_get_lease_time_args *args;
 	struct nfs4_get_lease_time_res *res;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1d4d259..b9ce3fd 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -338,6 +338,8 @@ static int nfs4_stat_to_errno(int);
 				     1 /* bctsr_use_conn_in_rdma_mode */)
 #define encode_destroy_session_maxsz    (op_encode_hdr_maxsz + 4)
 #define decode_destroy_session_maxsz    (op_decode_hdr_maxsz)
+#define encode_destroy_clientid_maxsz   (op_encode_hdr_maxsz + 2)
+#define decode_destroy_clientid_maxsz   (op_decode_hdr_maxsz)
 #define encode_sequence_maxsz	(op_encode_hdr_maxsz + \
 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4)
 #define decode_sequence_maxsz	(op_decode_hdr_maxsz + \
@@ -751,6 +753,10 @@ static int nfs4_stat_to_errno(int);
 					 encode_destroy_session_maxsz)
 #define NFS4_dec_destroy_session_sz	(compound_decode_hdr_maxsz + \
 					 decode_destroy_session_maxsz)
+#define NFS4_enc_destroy_clientid_sz	(compound_encode_hdr_maxsz + \
+					 encode_destroy_clientid_maxsz)
+#define NFS4_dec_destroy_clientid_sz	(compound_decode_hdr_maxsz + \
+					 decode_destroy_clientid_maxsz)
 #define NFS4_enc_sequence_sz \
 				(compound_decode_hdr_maxsz + \
 				 encode_sequence_maxsz)
@@ -1804,6 +1810,14 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 	encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 }
 
+static void encode_destroy_clientid(struct xdr_stream *xdr,
+				   uint64_t clientid,
+				   struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_DESTROY_CLIENTID, decode_destroy_clientid_maxsz, hdr);
+	encode_uint64(xdr, clientid);
+}
+
 static void encode_reclaim_complete(struct xdr_stream *xdr,
 				    struct nfs41_reclaim_complete_args *args,
 				    struct compound_hdr *hdr)
@@ -2724,6 +2738,22 @@ static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
 }
 
 /*
+ * a DESTROY_CLIENTID request
+ */
+static void nfs4_xdr_enc_destroy_clientid(struct rpc_rqst *req,
+					 struct xdr_stream *xdr,
+					 struct nfs_client *clp)
+{
+	struct compound_hdr hdr = {
+		.minorversion = clp->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_destroy_clientid(xdr, clp->cl_clientid, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
  * a SEQUENCE request
  */
 static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -5479,6 +5509,11 @@ static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
 	return decode_op_hdr(xdr, OP_DESTROY_SESSION);
 }
 
+static int decode_destroy_clientid(struct xdr_stream *xdr, void *dummy)
+{
+	return decode_op_hdr(xdr, OP_DESTROY_CLIENTID);
+}
+
 static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy)
 {
 	return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE);
@@ -6789,6 +6824,22 @@ static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
 }
 
 /*
+ * Decode DESTROY_CLIENTID response
+ */
+static int nfs4_xdr_dec_destroy_clientid(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_destroy_clientid(xdr, res);
+	return status;
+}
+
+/*
  * Decode SEQUENCE response
  */
 static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
@@ -7237,6 +7288,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(GETDEVICELIST,	enc_getdevicelist,	dec_getdevicelist),
 	PROC(BIND_CONN_TO_SESSION,
 			enc_bind_conn_to_session, dec_bind_conn_to_session),
+	PROC(DESTROY_CLIENTID,	enc_destroy_clientid,	dec_destroy_clientid),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 54006a9..af2d2fa 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -608,6 +608,7 @@ enum {
 	NFSPROC4_CLNT_FREE_STATEID,
 	NFSPROC4_CLNT_GETDEVICELIST,
 	NFSPROC4_CLNT_BIND_CONN_TO_SESSION,
+	NFSPROC4_CLNT_DESTROY_CLIENTID,
 };
 
 /* nfs41 types */
-- 
cgit v0.10.2


From 32b0131069c5bebf52368a9fe170f8d58b78fa8d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 26 May 2012 13:41:04 -0400
Subject: NFSv4.1: Don't clobber the seqid if exchange_id returns a confirmed
 clientid

If the EXCHGID4_FLAG_CONFIRMED_R flag is set, the client is in theory
supposed to already know the correct value of the seqid, in which case
RFC5661 states that it should ignore the value returned.

Also ensure that if the sanity check in nfs4_check_cl_exchange_flags
fails, then we must not change the nfs_client fields.

Finally, clean up the code: we don't need to retest the value of
'status' unless it can change.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 485a6c0..9f0a96f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5171,7 +5171,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
 	};
 	struct nfs41_exchange_id_res res = {
-		.client = clp,
+		0
 	};
 	int status;
 	struct rpc_message msg = {
@@ -5214,22 +5214,22 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 	if (status == 0)
-		status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
+		status = nfs4_check_cl_exchange_flags(res.flags);
 
 	if (status == 0) {
+		clp->cl_clientid = res.clientid;
+		clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R);
+		if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R))
+			clp->cl_seqid = res.seqid;
+
 		kfree(clp->cl_serverowner);
 		clp->cl_serverowner = res.server_owner;
 		res.server_owner = NULL;
-	}
 
-	if (status == 0) {
 		/* use the most recent implementation id */
 		kfree(clp->cl_implid);
 		clp->cl_implid = res.impl_id;
-	} else
-		kfree(res.impl_id);
 
-	if (status == 0) {
 		if (clp->cl_serverscope != NULL &&
 		    !nfs41_same_server_scope(clp->cl_serverscope,
 					     res.server_scope)) {
@@ -5244,7 +5244,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 			clp->cl_serverscope = res.server_scope;
 			goto out;
 		}
-	}
+	} else
+		kfree(res.impl_id);
 
 out_server_owner:
 	kfree(res.server_owner);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b9ce3fd..ee4a74d 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5319,7 +5319,6 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	uint32_t dummy;
 	char *dummy_str;
 	int status;
-	struct nfs_client *clp = res->client;
 	uint32_t impl_id_count;
 
 	status = decode_op_hdr(xdr, OP_EXCHANGE_ID);
@@ -5329,12 +5328,12 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
-	xdr_decode_hyper(p, &clp->cl_clientid);
+	xdr_decode_hyper(p, &res->clientid);
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
 		goto out_overflow;
-	clp->cl_seqid = be32_to_cpup(p++);
-	clp->cl_exchange_flags = be32_to_cpup(p++);
+	res->seqid = be32_to_cpup(p++);
+	res->flags = be32_to_cpup(p++);
 
 	/* We ask for SP4_NONE */
 	dummy = be32_to_cpup(p);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6387fc0..d1a7bf5 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1132,7 +1132,8 @@ struct nfs41_bind_conn_to_session_res {
 };
 
 struct nfs41_exchange_id_res {
-	struct nfs_client		*client;
+	u64				clientid;
+	u32				seqid;
 	u32				flags;
 	struct nfs41_server_owner	*server_owner;
 	struct nfs41_server_scope	*server_scope;
-- 
cgit v0.10.2


From 43ac544cb36adf38338c01968f8e3a5f81b7d629 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 27 May 2012 13:47:21 -0400
Subject: NFSv4.1: nfs4_bind_conn_to_session should drain the session

In order to avoid races with other RPC calls that end up setting the
NFS4CLNT_BIND_CONN_TO_SESSION flag.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 7dbca66..d21ed03 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1782,10 +1782,12 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
 	struct rpc_cred *cred;
 	int ret;
 
+	nfs4_begin_drain_session(clp);
 	cred = nfs4_get_exchange_id_cred(clp);
 	ret = nfs4_proc_bind_conn_to_session(clp, cred);
 	if (cred)
 		put_rpccred(cred);
+	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
 	return ret;
 }
 #else /* CONFIG_NFS_V4_1 */
-- 
cgit v0.10.2


From bf674c8228710fa4149df3988862dc112860df99 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 27 May 2012 12:53:10 -0400
Subject: NFSv4.1: Handle errors in nfs4_bind_conn_to_session

Ensure that we handle NFS4ERR_DELAY errors separately, and then
let nfs4_recovery_handle_error() handle all other cases.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index d21ed03..d685fd4 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1788,7 +1788,17 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
 	if (cred)
 		put_rpccred(cred);
 	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
-	return ret;
+	switch (ret) {
+	case 0:
+		break;
+	case -NFS4ERR_DELAY:
+		ssleep(1);
+		set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+		break;
+	default:
+		return nfs4_recovery_handle_error(clp, ret);
+	}
+	return 0;
 }
 #else /* CONFIG_NFS_V4_1 */
 static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
@@ -1858,6 +1868,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			status = nfs4_bind_conn_to_session(clp);
 			if (status < 0)
 				goto out_error;
+			continue;
 		}
 
 		/* First recover reboot state... */
-- 
cgit v0.10.2


From 7c5d7256845e30d295de4d72d4c52943bff7d1ac Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 27 May 2012 12:58:48 -0400
Subject: NFSv4.1: Handle NFS4ERR_CONN_NOT_BOUND_TO_SESSION in the state
 manager

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index d685fd4..e593ae9 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1493,12 +1493,14 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 		case -NFS4ERR_BADSLOT:
 		case -NFS4ERR_BAD_HIGH_SLOT:
 		case -NFS4ERR_DEADSESSION:
-		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 		case -NFS4ERR_SEQ_FALSE_RETRY:
 		case -NFS4ERR_SEQ_MISORDERED:
 			set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
 			/* Zero session reset errors */
 			break;
+		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+			set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+			break;
 		case -EKEYEXPIRED:
 			/* Nothing we can do */
 			nfs4_warn_keyexpired(clp->cl_hostname);
-- 
cgit v0.10.2


From 9f594791dd530c2dc06953fb32505a26cc28371f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 27 May 2012 13:02:53 -0400
Subject: NFSv4.1: Handle other occurrences of
 NFS4ERR_CONN_NOT_BOUND_TO_SESSION

Let nfs4_schedule_session_recovery() handle the details of choosing
between resetting the session, and other session related recovery.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index b20b516..c6827f93 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -313,9 +313,9 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
 #if defined(CONFIG_NFS_V4_1)
 struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
 struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
-extern void nfs4_schedule_session_recovery(struct nfs4_session *);
+extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
 #else
-static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
+static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 {
 }
 #endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index ddea4d3..e134029 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -165,7 +165,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 		dprintk("%s ERROR %d, Reset session. Exchangeid "
 			"flags 0x%x\n", __func__, task->tk_status,
 			clp->cl_exchange_flags);
-		nfs4_schedule_session_recovery(clp->cl_session);
+		nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
 		break;
 	case -NFS4ERR_DELAY:
 	case -NFS4ERR_GRACE:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9f0a96f..6be5fa3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -306,7 +306,7 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 		case -NFS4ERR_SEQ_MISORDERED:
 			dprintk("%s ERROR: %d Reset session\n", __func__,
 				errorcode);
-			nfs4_schedule_session_recovery(clp->cl_session);
+			nfs4_schedule_session_recovery(clp->cl_session, errorcode);
 			exception->retry = 1;
 			break;
 #endif /* defined(CONFIG_NFS_V4_1) */
@@ -1330,7 +1330,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 			case -NFS4ERR_BAD_HIGH_SLOT:
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 			case -NFS4ERR_DEADSESSION:
-				nfs4_schedule_session_recovery(server->nfs_client->cl_session);
+				nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
 				goto out;
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_STALE_STATEID:
@@ -3906,7 +3906,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_SEQ_MISORDERED:
 			dprintk("%s ERROR %d, Reset session\n", __func__,
 				task->tk_status);
-			nfs4_schedule_session_recovery(clp->cl_session);
+			nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
 			task->tk_status = 0;
 			return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
@@ -4847,7 +4847,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 			case -NFS4ERR_BAD_HIGH_SLOT:
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 			case -NFS4ERR_DEADSESSION:
-				nfs4_schedule_session_recovery(server->nfs_client->cl_session);
+				nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
 				goto out;
 			case -ERESTARTSYS:
 				/*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e593ae9..d46a905 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1635,11 +1635,17 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
 }
 
 #ifdef CONFIG_NFS_V4_1
-void nfs4_schedule_session_recovery(struct nfs4_session *session)
+void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 {
 	struct nfs_client *clp = session->clp;
 
-	set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+	switch (err) {
+	default:
+		set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+		break;
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+		set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+	}
 	nfs4_schedule_lease_recovery(clp);
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
-- 
cgit v0.10.2


From f2c1b5100db340441963649fabb4e43e2a65df77 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 27 May 2012 14:46:46 -0400
Subject: NFSv4.1: nfs4_reset_session should use
 nfs4_handle_reclaim_lease_error

The results from a call to nfs4_proc_create_session() should always
be fed into nfs4_handle_reclaim_lease_error, so that we can
handle errors such as NFS4ERR_SEQ_MISORDERED correctly.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index d46a905..dfedde6 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1740,7 +1740,7 @@ static int nfs4_reset_session(struct nfs_client *clp)
 	memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
 	status = nfs4_proc_create_session(clp, cred);
 	if (status) {
-		status = nfs4_recovery_handle_error(clp, status);
+		status = nfs4_handle_reclaim_lease_error(clp, status);
 		goto out;
 	}
 	clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-- 
cgit v0.10.2


From 359d7d1c976851c658aa7085761015812ed3b56f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 May 2012 10:01:34 -0400
Subject: NFSv4: update_changeattr does not need to set NFS_INO_REVAL_PAGECACHE

We're already invalidating the data cache, and setting the new change
attribute. Since directories don't care about the i_size field, there
is no need to be forcing any extra revalidation of the page cache.

We do keep the NFS_INO_INVALID_ATTR flag, in order to force an
attribute cache revalidation on stat() calls since we do not
update the mtime and ctime fields.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6be5fa3..af2db2c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -774,7 +774,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 	struct nfs_inode *nfsi = NFS_I(dir);
 
 	spin_lock(&dir->i_lock);
-	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 	if (!cinfo->atomic || cinfo->before != dir->i_version)
 		nfs_force_lookup_revalidate(dir);
 	dir->i_version = cinfo->after;
-- 
cgit v0.10.2


From fb13bfa7e1bcfdcfdece47c24b62f1a1cad957e9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 May 2012 11:36:28 -0400
Subject: NFSv4: Map NFS4ERR_SHARE_DENIED into an EACCES error instead of EIO

If a file OPEN is denied due to a share lock, the resulting
NFS4ERR_SHARE_DENIED is currently mapped to the default EIO.
This patch adds a more appropriate mapping, and brings Linux
into line with what Solaris 10 does.

See https://bugzilla.kernel.org/show_bug.cgi?id=43286

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index af2db2c..42d9e9c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -103,6 +103,8 @@ static int nfs4_map_errors(int err)
 	case -NFS4ERR_BADOWNER:
 	case -NFS4ERR_BADNAME:
 		return -EINVAL;
+	case -NFS4ERR_SHARE_DENIED:
+		return -EACCES;
 	default:
 		dprintk("%s could not handle NFSv4 error %d\n",
 				__func__, -err);
-- 
cgit v0.10.2


From cc0a98436820b161b595b8cc1d2329bcf7328107 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 May 2012 15:12:27 -0400
Subject: NFSv4: Add debugging printks to state manager

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index dfedde6..c679b9e 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1108,6 +1108,8 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
 		return;
 	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
 		set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+	dprintk("%s: scheduling lease recovery for server %s\n", __func__,
+			clp->cl_hostname);
 	nfs4_schedule_state_manager(clp);
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
@@ -1124,6 +1126,8 @@ static void nfs40_handle_cb_pathdown(struct nfs_client *clp)
 {
 	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 	nfs_expire_all_delegations(clp);
+	dprintk("%s: handling CB_PATHDOWN recovery for server %s\n", __func__,
+			clp->cl_hostname);
 }
 
 void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
@@ -1160,6 +1164,8 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4
 	struct nfs_client *clp = server->nfs_client;
 
 	nfs4_state_mark_reclaim_nograce(clp, state);
+	dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
+			clp->cl_hostname);
 	nfs4_schedule_state_manager(clp);
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
@@ -1506,8 +1512,12 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 			nfs4_warn_keyexpired(clp->cl_hostname);
 			break;
 		default:
+			dprintk("%s: failed to handle error %d for server %s\n",
+					__func__, error, clp->cl_hostname);
 			return error;
 	}
+	dprintk("%s: handled error %d for server %s\n", __func__, error,
+			clp->cl_hostname);
 	return 0;
 }
 
@@ -1605,15 +1615,21 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
 	case -NFS4ERR_MINOR_VERS_MISMATCH:
 		if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
 			nfs_mark_client_ready(clp, -EPROTONOSUPPORT);
+		dprintk("%s: exit with error %d for server %s\n",
+				__func__, -EPROTONOSUPPORT, clp->cl_hostname);
 		return -EPROTONOSUPPORT;
 	case -EKEYEXPIRED:
 		nfs4_warn_keyexpired(clp->cl_hostname);
 	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
 				 * in nfs4_exchange_id */
 	default:
+		dprintk("%s: exit with error %d for server %s\n", __func__,
+				status, clp->cl_hostname);
 		return status;
 	}
 	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+	dprintk("%s: handled error %d for server %s\n", __func__, status,
+			clp->cl_hostname);
 	return 0;
 }
 
@@ -1653,6 +1669,8 @@ EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
 void nfs41_handle_recall_slot(struct nfs_client *clp)
 {
 	set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
+	dprintk("%s: scheduling slot recall for server %s\n", __func__,
+			clp->cl_hostname);
 	nfs4_schedule_state_manager(clp);
 }
 
@@ -1662,6 +1680,8 @@ static void nfs4_reset_all_state(struct nfs_client *clp)
 		set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
 		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
 		nfs4_state_start_reclaim_nograce(clp);
+		dprintk("%s: scheduling reset of all state for server %s!\n",
+				__func__, clp->cl_hostname);
 		nfs4_schedule_state_manager(clp);
 	}
 }
@@ -1670,6 +1690,8 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
 {
 	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
 		nfs4_state_start_reclaim_reboot(clp);
+		dprintk("%s: server %s rebooted!\n", __func__,
+				clp->cl_hostname);
 		nfs4_schedule_state_manager(clp);
 	}
 }
@@ -1677,12 +1699,15 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
 static void nfs41_handle_state_revoked(struct nfs_client *clp)
 {
 	nfs4_reset_all_state(clp);
+	dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
 }
 
 static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
 {
 	/* This will need to handle layouts too */
 	nfs_expire_all_delegations(clp);
+	dprintk("%s: Recallable state revoked on server %s!\n", __func__,
+			clp->cl_hostname);
 }
 
 static void nfs41_handle_backchannel_fault(struct nfs_client *clp)
@@ -1690,6 +1715,8 @@ static void nfs41_handle_backchannel_fault(struct nfs_client *clp)
 	nfs_expire_all_delegations(clp);
 	if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
 		nfs4_schedule_state_manager(clp);
+	dprintk("%s: server %s declared a backchannel fault\n", __func__,
+			clp->cl_hostname);
 }
 
 static void nfs41_handle_cb_path_down(struct nfs_client *clp)
@@ -1740,6 +1767,8 @@ static int nfs4_reset_session(struct nfs_client *clp)
 	memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
 	status = nfs4_proc_create_session(clp, cred);
 	if (status) {
+		dprintk("%s: session reset failed with status %d for server %s!\n",
+			__func__, status, clp->cl_hostname);
 		status = nfs4_handle_reclaim_lease_error(clp, status);
 		goto out;
 	}
@@ -1747,6 +1776,8 @@ static int nfs4_reset_session(struct nfs_client *clp)
 	/* create_session negotiated new slot table */
 	clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
 	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+	dprintk("%s: session reset was successful for server %s!\n",
+			__func__, clp->cl_hostname);
 
 	 /* Let the state manager reestablish state */
 	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
@@ -1798,6 +1829,8 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
 	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
 	switch (ret) {
 	case 0:
+		dprintk("%s: bind_conn_to_session was successful for server %s!\n",
+			__func__, clp->cl_hostname);
 		break;
 	case -NFS4ERR_DELAY:
 		ssleep(1);
-- 
cgit v0.10.2


From d3ca8b64b97ef4dc54d7bb0b88bbc01a1fca8cb9 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Tue, 29 May 2012 13:45:01 +0100
Subject: pty: Fix lock inversion

The ptmx_open path takes the tty and devpts locks in the wrong order
because tty_init_dev locks and returns a locked tty.  As far as I can
tell this is actually safe anyway because the tty being returned is new
so nobody can get a reference to lock it at this point.

However we don't even need the devpts lock at this point, it's only held
as a byproduct of the way the locks were pushe down.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 59af394..65c7c62 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -633,7 +633,6 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 	mutex_unlock(&devpts_mutex);
 
 	mutex_lock(&tty_mutex);
-	mutex_lock(&devpts_mutex);
 	tty = tty_init_dev(ptm_driver, index);
 
 	if (IS_ERR(tty)) {
@@ -643,7 +642,6 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 
 	/* The tty returned here is locked so we can safely
 	   drop the mutex */
-	mutex_unlock(&devpts_mutex);
 	mutex_unlock(&tty_mutex);
 
 	set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */
-- 
cgit v0.10.2


From 8f6576ad476b2a22d05ddafd2ddaee102577a4ed Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Tue, 29 May 2012 13:45:16 +0100
Subject: tty: fix ldisc lock inversion trace

This is caused by tty_release using tty_lock_pair to lock both sides of
the pty/tty pair, and then tty_ldisc_release dropping and relocking one
side only.  We can drop both fine, so drop both to avoid any lock
ordering concerns.

Rework the release path to fix the new locking model.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index 173a900..ba8be39 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -894,6 +894,23 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
 	tty_ldisc_enable(tty);
 	return 0;
 }
+
+static void tty_ldisc_kill(struct tty_struct *tty)
+{
+	mutex_lock(&tty->ldisc_mutex);
+	/*
+	 * Now kill off the ldisc
+	 */
+	tty_ldisc_close(tty, tty->ldisc);
+	tty_ldisc_put(tty->ldisc);
+	/* Force an oops if we mess this up */
+	tty->ldisc = NULL;
+
+	/* Ensure the next open requests the N_TTY ldisc */
+	tty_set_termios_ldisc(tty, N_TTY);
+	mutex_unlock(&tty->ldisc_mutex);
+}
+
 /**
  *	tty_ldisc_release		-	release line discipline
  *	@tty: tty being shut down
@@ -912,27 +929,19 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 	 * race with the set_ldisc code path.
 	 */
 
-	tty_unlock(tty);
+	tty_unlock_pair(tty, o_tty);
 	tty_ldisc_halt(tty);
 	tty_ldisc_flush_works(tty);
-	tty_lock(tty);
-
-	mutex_lock(&tty->ldisc_mutex);
-	/*
-	 * Now kill off the ldisc
-	 */
-	tty_ldisc_close(tty, tty->ldisc);
-	tty_ldisc_put(tty->ldisc);
-	/* Force an oops if we mess this up */
-	tty->ldisc = NULL;
+	if (o_tty) {
+		tty_ldisc_halt(o_tty);
+		tty_ldisc_flush_works(o_tty);
+	}
+	tty_lock_pair(tty, o_tty);
 
-	/* Ensure the next open requests the N_TTY ldisc */
-	tty_set_termios_ldisc(tty, N_TTY);
-	mutex_unlock(&tty->ldisc_mutex);
 
-	/* This will need doing differently if we need to lock */
+	tty_ldisc_kill(tty);
 	if (o_tty)
-		tty_ldisc_release(o_tty, NULL);
+		tty_ldisc_kill(o_tty);
 
 	/* And the memory resources remaining (buffers, termios) will be
 	   disposed of when the kref hits zero */
-- 
cgit v0.10.2


From af2e840971dee21ba9b87e9ecee7d5cc6109baaa Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Tue, 29 May 2012 15:06:14 -0700
Subject: pagemap.h: fix warning about possibly used before init var

Commit f56f821feb7b ("mm: extend prefault helpers to fault in more than
PAGE_SIZE") added in the new functions: fault_in_multipages_writeable()
and fault_in_multipages_readable().

However, we currently see:

  include/linux/pagemap.h:492: warning: 'ret' may be used uninitialized in this function
  include/linux/pagemap.h:492: note: 'ret' was declared here

Unlike a lot of gcc nags, this one appears somewhat legit.  i.e.  passing
in an invalid negative value of "size" does make it look like all the
conditionals in there would be bypassed and the uninitialized value would
be returned.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index efa26b4..7cfad3b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -460,11 +460,11 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size)
  */
 static inline int fault_in_multipages_writeable(char __user *uaddr, int size)
 {
-	int ret;
+	int ret = 0;
 	char __user *end = uaddr + size - 1;
 
 	if (unlikely(size == 0))
-		return 0;
+		return ret;
 
 	/*
 	 * Writing zeroes into userspace here is OK, because we know that if
@@ -489,11 +489,11 @@ static inline int fault_in_multipages_readable(const char __user *uaddr,
 					       int size)
 {
 	volatile char c;
-	int ret;
+	int ret = 0;
 	const char __user *end = uaddr + size - 1;
 
 	if (unlikely(size == 0))
-		return 0;
+		return ret;
 
 	while (uaddr <= end) {
 		ret = __get_user(c, uaddr);
-- 
cgit v0.10.2


From 4c9c6a1bc869daeb1575442cbf94ba5878c08a67 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 29 May 2012 15:06:15 -0700
Subject: cris: select GENERIC_ATOMIC64

Cris doesn't implement atomic64 operations neither, should select
GENERIC_ATOMIC64.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 22d34d6..bb34465 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -40,6 +40,7 @@ config CRIS
 	bool
 	default y
 	select HAVE_IDE
+	select GENERIC_ATOMIC64
 	select HAVE_GENERIC_HARDIRQS
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IOMAP
-- 
cgit v0.10.2


From 08fa29d916c6e271ad13978cd993e7238c68db97 Mon Sep 17 00:00:00 2001
From: Sasha Levin <levinsasha928@gmail.com>
Date: Tue, 29 May 2012 15:06:15 -0700
Subject: mm: fix NULL ptr deref when walking hugepages

A missing validation of the value returned by find_vma() could cause a
NULL ptr dereference when walking the pagetable.

This is triggerable from usermode by a simple user by trying to read a
page info out of /proc/pid/pagemap which doesn't exist.

Introduced by commit 025c5b2451e4 ("thp: optimize away unnecessary page
table locking").

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: <stable@vger.kernel.org>		[3.4.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 1030a71..7faaf2a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -784,7 +784,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 	/* find the first VMA at or above 'addr' */
 	vma = find_vma(walk->mm, addr);
-	if (pmd_trans_huge_lock(pmd, vma) == 1) {
+	if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
 		for (; addr != end; addr += PAGE_SIZE) {
 			unsigned long offset;
 
-- 
cgit v0.10.2


From 71dd0b8ae83120db9ade1380fae7a49cc6b3f465 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Tue, 29 May 2012 15:06:16 -0700
Subject: mm/memory_failure: let the compiler add the function name

These things tend to get out of sync with time so let the compiler
automatically enter the current function name using __func__.

No functional change.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Andi Kleen <andi@firstfloor.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c99ad4e..ab1e714 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1388,16 +1388,16 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
 	 */
 	if (!get_page_unless_zero(compound_head(p))) {
 		if (PageHuge(p)) {
-			pr_info("get_any_page: %#lx free huge page\n", pfn);
+			pr_info("%s: %#lx free huge page\n", __func__, pfn);
 			ret = dequeue_hwpoisoned_huge_page(compound_head(p));
 		} else if (is_free_buddy_page(p)) {
-			pr_info("get_any_page: %#lx free buddy page\n", pfn);
+			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
 			/* Set hwpoison bit while page is still isolated */
 			SetPageHWPoison(p);
 			ret = 0;
 		} else {
-			pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
-				pfn, p->flags);
+			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
+				__func__, pfn, p->flags);
 			ret = -EIO;
 		}
 	} else {
-- 
cgit v0.10.2


From 89c522c78a6fbc0e24b71e65b65df8b247d5aebd Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@gmail.com>
Date: Tue, 29 May 2012 15:06:16 -0700
Subject: mm/mempolicy.c: use enum value MPOL_REBIND_ONCE in
 mpol_rebind_policy()

We have enum definition in mempolicy.h: MPOL_REBIND_ONCE.  It should
replace the magic number 0 for step comparison in function
mpol_rebind_policy.

Signed-off-by: Wang Sheng-Hui <shhuiw@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 88f9422..a388888 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 {
 	if (!pol)
 		return;
-	if (!mpol_store_user_nodemask(pol) && step == 0 &&
+	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 		return;
 
-- 
cgit v0.10.2


From f2135a4a57a798ca8014a138afb626b96d90c842 Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@gmail.com>
Date: Tue, 29 May 2012 15:06:17 -0700
Subject: mm/hugetlb.c: use long vars instead of int in region_count()

The arguments f & t and fields from & to of struct file_region are
defined as long.  So use long instead of int to type the temp vars.

Signed-off-by: Wang Sheng-Hui <shhuiw@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Hillf Danton <dhillf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4e28416..41a647d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t)
 
 	/* Locate each segment we overlap with, and count that overlap. */
 	list_for_each_entry(rg, head, link) {
-		int seg_from;
-		int seg_to;
+		long seg_from;
+		long seg_to;
 
 		if (rg->to <= f)
 			continue;
-- 
cgit v0.10.2


From aa2e878efa7949c8502c9760f92835222714f090 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 29 May 2012 15:06:17 -0700
Subject: mm, thp: remove unnecessary ret variable

The "ret" variable is unnecessary in __do_huge_pmd_anonymous_page(), so
remove it.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f0e5306..8ab2d24 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -636,7 +636,6 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 					unsigned long haddr, pmd_t *pmd,
 					struct page *page)
 {
-	int ret = 0;
 	pgtable_t pgtable;
 
 	VM_BUG_ON(!PageCompound(page));
@@ -675,7 +674,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		spin_unlock(&mm->page_table_lock);
 	}
 
-	return ret;
+	return 0;
 }
 
 static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
-- 
cgit v0.10.2


From edad9d2c337d43278a9d5aeb0ed531c2e838f8a6 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 29 May 2012 15:06:17 -0700
Subject: mm, thp: allow fallback when pte_alloc_one() fails for huge pmd

The transparent hugepages feature is careful to not invoke the oom
killer when a hugepage cannot be allocated.

pte_alloc_one() failing in __do_huge_pmd_anonymous_page(), however,
currently results in VM_FAULT_OOM which invokes the pagefault oom killer
to kill a memory-hogging task.

This is unnecessary since it's possible to drop the reference to the
hugepage and fallback to allocating a small page.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8ab2d24..d7d7165 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -640,11 +640,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
 	VM_BUG_ON(!PageCompound(page));
 	pgtable = pte_alloc_one(mm, haddr);
-	if (unlikely(!pgtable)) {
-		mem_cgroup_uncharge_page(page);
-		put_page(page);
+	if (unlikely(!pgtable))
 		return VM_FAULT_OOM;
-	}
 
 	clear_huge_page(page, haddr, HPAGE_PMD_NR);
 	__SetPageUptodate(page);
@@ -723,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			put_page(page);
 			goto out;
 		}
+		if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
+							  page))) {
+			mem_cgroup_uncharge_page(page);
+			put_page(page);
+			goto out;
+		}
 
-		return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
+		return 0;
 	}
 out:
 	/*
-- 
cgit v0.10.2


From e709ffd6169ccd259eb5874e853303e91e94e829 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 29 May 2012 15:06:18 -0700
Subject: mm: remove swap token code

The swap token code no longer fits in with the current VM model.  It
does not play well with cgroups or the better NUMA placement code in
development, since we have only one swap token globally.

It also has the potential to mess with scalability of the system, by
increasing the number of non-reclaimable pages on the active and
inactive anon LRU lists.

Last but not least, the swap token code has been broken for a year
without complaints, as reported by Konstantin Khlebnikov.  This suggests
we no longer have much use for it.

The days of sub-1G memory systems with heavy use of swap are over.  If
we ever need thrashing reducing code in the future, we will have to
implement something that does scale.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hughd@google.com>
Acked-by: Bob Picco <bpicco@meloft.net>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 26574c7..dad95bd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -345,17 +345,6 @@ struct mm_struct {
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
-	/* Swap token stuff */
-	/*
-	 * Last value of global fault stamp as seen by this process.
-	 * In other words, this value gives an indication of how long
-	 * it has been since this task got the token.
-	 * Look at mm/thrash.c
-	 */
-	unsigned int faultstamp;
-	unsigned int token_priority;
-	unsigned int last_interval;
-
 	unsigned long flags; /* Must use atomic bitops to access the bits */
 
 	struct core_state *core_state; /* coredumping support */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b1fd5c7..bc3073c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -355,23 +355,6 @@ extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
 
-/* linux/mm/thrash.c */
-extern struct mm_struct *swap_token_mm;
-extern void grab_swap_token(struct mm_struct *);
-extern void __put_swap_token(struct mm_struct *);
-extern void disable_swap_token(struct mem_cgroup *memcg);
-
-static inline int has_swap_token(struct mm_struct *mm)
-{
-	return (mm == swap_token_mm);
-}
-
-static inline void put_swap_token(struct mm_struct *mm)
-{
-	if (has_swap_token(mm))
-		__put_swap_token(mm);
-}
-
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 extern void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
@@ -476,24 +459,6 @@ static inline swp_entry_t get_swap_page(void)
 	return entry;
 }
 
-/* linux/mm/thrash.c */
-static inline void put_swap_token(struct mm_struct *mm)
-{
-}
-
-static inline void grab_swap_token(struct mm_struct *mm)
-{
-}
-
-static inline int has_swap_token(struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void disable_swap_token(struct mem_cgroup *memcg)
-{
-}
-
 static inline void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
 {
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index f64560e..5721954 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -395,88 +395,6 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
 		show_reclaim_flags(__entry->reclaim_flags))
 );
 
-TRACE_EVENT(replace_swap_token,
-	TP_PROTO(struct mm_struct *old_mm,
-		 struct mm_struct *new_mm),
-
-	TP_ARGS(old_mm, new_mm),
-
-	TP_STRUCT__entry(
-		__field(struct mm_struct*,	old_mm)
-		__field(unsigned int,		old_prio)
-		__field(struct mm_struct*,	new_mm)
-		__field(unsigned int,		new_prio)
-	),
-
-	TP_fast_assign(
-		__entry->old_mm   = old_mm;
-		__entry->old_prio = old_mm ? old_mm->token_priority : 0;
-		__entry->new_mm   = new_mm;
-		__entry->new_prio = new_mm->token_priority;
-	),
-
-	TP_printk("old_token_mm=%p old_prio=%u new_token_mm=%p new_prio=%u",
-		  __entry->old_mm, __entry->old_prio,
-		  __entry->new_mm, __entry->new_prio)
-);
-
-DECLARE_EVENT_CLASS(put_swap_token_template,
-	TP_PROTO(struct mm_struct *swap_token_mm),
-
-	TP_ARGS(swap_token_mm),
-
-	TP_STRUCT__entry(
-		__field(struct mm_struct*, swap_token_mm)
-	),
-
-	TP_fast_assign(
-		__entry->swap_token_mm = swap_token_mm;
-	),
-
-	TP_printk("token_mm=%p", __entry->swap_token_mm)
-);
-
-DEFINE_EVENT(put_swap_token_template, put_swap_token,
-	TP_PROTO(struct mm_struct *swap_token_mm),
-	TP_ARGS(swap_token_mm)
-);
-
-DEFINE_EVENT_CONDITION(put_swap_token_template, disable_swap_token,
-	TP_PROTO(struct mm_struct *swap_token_mm),
-	TP_ARGS(swap_token_mm),
-	TP_CONDITION(swap_token_mm != NULL)
-);
-
-TRACE_EVENT_CONDITION(update_swap_token_priority,
-	TP_PROTO(struct mm_struct *mm,
-		 unsigned int old_prio,
-		 struct mm_struct *swap_token_mm),
-
-	TP_ARGS(mm, old_prio, swap_token_mm),
-
-	TP_CONDITION(mm->token_priority != old_prio),
-
-	TP_STRUCT__entry(
-		__field(struct mm_struct*, mm)
-		__field(unsigned int, old_prio)
-		__field(unsigned int, new_prio)
-		__field(struct mm_struct*, swap_token_mm)
-		__field(unsigned int, swap_token_prio)
-	),
-
-	TP_fast_assign(
-		__entry->mm		= mm;
-		__entry->old_prio	= old_prio;
-		__entry->new_prio	= mm->token_priority;
-		__entry->swap_token_mm	= swap_token_mm;
-		__entry->swap_token_prio = swap_token_mm ? swap_token_mm->token_priority : 0;
-	),
-
-	TP_printk("mm=%p old_prio=%u new_prio=%u swap_token_mm=%p token_prio=%u",
-		  __entry->mm, __entry->old_prio, __entry->new_prio,
-		  __entry->swap_token_mm, __entry->swap_token_prio)
-);
-
 #endif /* _TRACE_VMSCAN_H */
 
 /* This part must be outside protection */
diff --git a/kernel/fork.c b/kernel/fork.c
index 47b4e4f..5b13eea 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -614,7 +614,6 @@ void mmput(struct mm_struct *mm)
 			list_del(&mm->mmlist);
 			spin_unlock(&mmlist_lock);
 		}
-		put_swap_token(mm);
 		if (mm->binfmt)
 			module_put(mm->binfmt->module);
 		mmdrop(mm);
@@ -831,10 +830,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	memcpy(mm, oldmm, sizeof(*mm));
 	mm_init_cpumask(mm);
 
-	/* Initializing for Swap token stuff */
-	mm->token_priority = 0;
-	mm->last_interval = 0;
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	mm->pmd_huge_pte = NULL;
 #endif
@@ -913,10 +908,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 		goto fail_nomem;
 
 good_mm:
-	/* Initializing for Swap token stuff */
-	mm->token_priority = 0;
-	mm->last_interval = 0;
-
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	return 0;
diff --git a/mm/Makefile b/mm/Makefile
index 8aada89..ccecbf9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -25,7 +25,7 @@ endif
 obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
-obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f342778..92675fe 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5598,7 +5598,6 @@ static void mem_cgroup_move_task(struct cgroup *cont,
 	if (mm) {
 		if (mc.to)
 			mem_cgroup_move_charge(mm);
-		put_swap_token(mm);
 		mmput(mm);
 	}
 	if (mc.to)
diff --git a/mm/memory.c b/mm/memory.c
index e40f675..2bf9e11 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2908,7 +2908,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry);
 	if (!page) {
-		grab_swap_token(mm); /* Contend for token _before_ read-in */
 		page = swapin_readahead(entry,
 					GFP_HIGHUSER_MOVABLE, vma, address);
 		if (!page) {
@@ -2938,6 +2937,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	locked = lock_page_or_retry(page, mm, flags);
+
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 	if (!locked) {
 		ret |= VM_FAULT_RETRY;
diff --git a/mm/rmap.c b/mm/rmap.c
index 5b5ad58..0f3b7cd 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 		pte_unmap_unlock(pte, ptl);
 	}
 
-	/* Pretend the page is referenced if the task has the
-	   swap token and is in the middle of a page fault. */
-	if (mm != current->mm && has_swap_token(mm) &&
-			rwsem_is_locked(&mm->mmap_sem))
-		referenced++;
-
 	(*mapcount)--;
 
 	if (referenced)
diff --git a/mm/thrash.c b/mm/thrash.c
deleted file mode 100644
index 57ad495..0000000
--- a/mm/thrash.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * mm/thrash.c
- *
- * Copyright (C) 2004, Red Hat, Inc.
- * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
- * Released under the GPL, see the file COPYING for details.
- *
- * Simple token based thrashing protection, using the algorithm
- * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
- *
- * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
- * Improved algorithm to pass token:
- * Each task has a priority which is incremented if it contended
- * for the token in an interval less than its previous attempt.
- * If the token is acquired, that task's priority is boosted to prevent
- * the token from bouncing around too often and to let the task make
- * some progress in its execution.
- */
-
-#include <linux/jiffies.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/swap.h>
-#include <linux/memcontrol.h>
-
-#include <trace/events/vmscan.h>
-
-#define TOKEN_AGING_INTERVAL	(0xFF)
-
-static DEFINE_SPINLOCK(swap_token_lock);
-struct mm_struct *swap_token_mm;
-static struct mem_cgroup *swap_token_memcg;
-
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = try_get_mem_cgroup_from_mm(mm);
-	if (memcg)
-		css_put(mem_cgroup_css(memcg));
-
-	return memcg;
-}
-#else
-static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
-{
-	return NULL;
-}
-#endif
-
-void grab_swap_token(struct mm_struct *mm)
-{
-	int current_interval;
-	unsigned int old_prio = mm->token_priority;
-	static unsigned int global_faults;
-	static unsigned int last_aging;
-
-	global_faults++;
-
-	current_interval = global_faults - mm->faultstamp;
-
-	if (!spin_trylock(&swap_token_lock))
-		return;
-
-	/* First come first served */
-	if (!swap_token_mm)
-		goto replace_token;
-
-	/*
-	 * Usually, we don't need priority aging because long interval faults
-	 * makes priority decrease quickly. But there is one exception. If the
-	 * token owner task is sleeping, it never make long interval faults.
-	 * Thus, we need a priority aging mechanism instead. The requirements
-	 * of priority aging are
-	 *  1) An aging interval is reasonable enough long. Too short aging
-	 *     interval makes quick swap token lost and decrease performance.
-	 *  2) The swap token owner task have to get priority aging even if
-	 *     it's under sleep.
-	 */
-	if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
-		swap_token_mm->token_priority /= 2;
-		last_aging = global_faults;
-	}
-
-	if (mm == swap_token_mm) {
-		mm->token_priority += 2;
-		goto update_priority;
-	}
-
-	if (current_interval < mm->last_interval)
-		mm->token_priority++;
-	else {
-		if (likely(mm->token_priority > 0))
-			mm->token_priority--;
-	}
-
-	/* Check if we deserve the token */
-	if (mm->token_priority > swap_token_mm->token_priority)
-		goto replace_token;
-
-update_priority:
-	trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
-
-out:
-	mm->faultstamp = global_faults;
-	mm->last_interval = current_interval;
-	spin_unlock(&swap_token_lock);
-	return;
-
-replace_token:
-	mm->token_priority += 2;
-	trace_replace_swap_token(swap_token_mm, mm);
-	swap_token_mm = mm;
-	swap_token_memcg = swap_token_memcg_from_mm(mm);
-	last_aging = global_faults;
-	goto out;
-}
-
-/* Called on process exit. */
-void __put_swap_token(struct mm_struct *mm)
-{
-	spin_lock(&swap_token_lock);
-	if (likely(mm == swap_token_mm)) {
-		trace_put_swap_token(swap_token_mm);
-		swap_token_mm = NULL;
-		swap_token_memcg = NULL;
-	}
-	spin_unlock(&swap_token_lock);
-}
-
-static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
-{
-	if (!a)
-		return true;
-	if (!b)
-		return true;
-	if (a == b)
-		return true;
-	return false;
-}
-
-void disable_swap_token(struct mem_cgroup *memcg)
-{
-	/* memcg reclaim don't disable unrelated mm token. */
-	if (match_memcg(memcg, swap_token_memcg)) {
-		spin_lock(&swap_token_lock);
-		if (match_memcg(memcg, swap_token_memcg)) {
-			trace_disable_swap_token(swap_token_mm);
-			swap_token_mm = NULL;
-			swap_token_memcg = NULL;
-		}
-		spin_unlock(&swap_token_lock);
-	}
-}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33dc256..ca46080 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2352,8 +2352,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		sc->nr_scanned = 0;
-		if (!priority)
-			disable_swap_token(sc->target_mem_cgroup);
 		aborted_reclaim = shrink_zones(priority, zonelist, sc);
 
 		/*
@@ -2704,10 +2702,6 @@ loop_again:
 		unsigned long lru_pages = 0;
 		int has_under_min_watermark_zone = 0;
 
-		/* The swap token gets in the way of swapout... */
-		if (!priority)
-			disable_swap_token(NULL);
-
 		all_zones_ok = 1;
 		balanced = 0;
 
-- 
cgit v0.10.2


From c53919adc045bf803252e912f23028a68525753d Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 29 May 2012 15:06:19 -0700
Subject: mm: vmscan: remove lumpy reclaim

This series removes lumpy reclaim and some stalling logic that was
unintentionally being used by memory compaction.  The end result is that
stalling on dirty pages during page reclaim now depends on
wait_iff_congested().

Four kernels were compared

  3.3.0     vanilla
  3.4.0-rc2 vanilla
  3.4.0-rc2 lumpyremove-v2 is patch one from this series
  3.4.0-rc2 nosync-v2r3 is the full series

Removing lumpy reclaim saves almost 900 bytes of text whereas the full
series removes 1200 bytes.

     text     data      bss       dec     hex  filename
  6740375  1927944  2260992  10929311  a6c49f  vmlinux-3.4.0-rc2-vanilla
  6739479  1927944  2260992  10928415  a6c11f  vmlinux-3.4.0-rc2-lumpyremove-v2
  6739159  1927944  2260992  10928095  a6bfdf  vmlinux-3.4.0-rc2-nosync-v2

There are behaviour changes in the series and so tests were run with
monitoring of ftrace events.  This disrupts results so the performance
results are distorted but the new behaviour should be clearer.

fs-mark running in a threaded configuration showed little of interest as
it did not push reclaim aggressively

  FS-Mark Multi Threaded
                          3.3.0-vanilla       rc2-vanilla       lumpyremove-v2r3       nosync-v2r3
  Files/s  min           3.20 ( 0.00%)        3.20 ( 0.00%)        3.20 ( 0.00%)        3.20 ( 0.00%)
  Files/s  mean          3.20 ( 0.00%)        3.20 ( 0.00%)        3.20 ( 0.00%)        3.20 ( 0.00%)
  Files/s  stddev        0.00 ( 0.00%)        0.00 ( 0.00%)        0.00 ( 0.00%)        0.00 ( 0.00%)
  Files/s  max           3.20 ( 0.00%)        3.20 ( 0.00%)        3.20 ( 0.00%)        3.20 ( 0.00%)
  Overhead min      508667.00 ( 0.00%)   521350.00 (-2.49%)   544292.00 (-7.00%)   547168.00 (-7.57%)
  Overhead mean     551185.00 ( 0.00%)   652690.73 (-18.42%)   991208.40 (-79.83%)   570130.53 (-3.44%)
  Overhead stddev    18200.69 ( 0.00%)   331958.29 (-1723.88%)  1579579.43 (-8578.68%)     9576.81 (47.38%)
  Overhead max      576775.00 ( 0.00%)  1846634.00 (-220.17%)  6901055.00 (-1096.49%)   585675.00 (-1.54%)
  MMTests Statistics: duration
  Sys Time Running Test (seconds)             309.90    300.95    307.33    298.95
  User+Sys Time Running Test (seconds)        319.32    309.67    315.69    307.51
  Total Elapsed Time (seconds)               1187.85   1193.09   1191.98   1193.73

  MMTests Statistics: vmstat
  Page Ins                                       80532       82212       81420       79480
  Page Outs                                  111434984   111456240   111437376   111582628
  Swap Ins                                           0           0           0           0
  Swap Outs                                          0           0           0           0
  Direct pages scanned                           44881       27889       27453       34843
  Kswapd pages scanned                        25841428    25860774    25861233    25843212
  Kswapd pages reclaimed                      25841393    25860741    25861199    25843179
  Direct pages reclaimed                         44881       27889       27453       34843
  Kswapd efficiency                                99%         99%         99%         99%
  Kswapd velocity                            21754.791   21675.460   21696.029   21649.127
  Direct efficiency                               100%        100%        100%        100%
  Direct velocity                               37.783      23.375      23.031      29.188
  Percentage direct scans                           0%          0%          0%          0%

ftrace showed that there was no stalling on writeback or pages submitted
for IO from reclaim context.

postmark was similar and while it was more interesting, it also did not
push reclaim heavily.

  POSTMARK
                                       3.3.0-vanilla       rc2-vanilla  lumpyremove-v2r3       nosync-v2r3
  Transactions per second:               16.00 ( 0.00%)    20.00 (25.00%)    18.00 (12.50%)    17.00 ( 6.25%)
  Data megabytes read per second:        18.80 ( 0.00%)    24.27 (29.10%)    22.26 (18.40%)    20.54 ( 9.26%)
  Data megabytes written per second:     35.83 ( 0.00%)    46.25 (29.08%)    42.42 (18.39%)    39.14 ( 9.24%)
  Files created alone per second:        28.00 ( 0.00%)    38.00 (35.71%)    34.00 (21.43%)    30.00 ( 7.14%)
  Files create/transact per second:       8.00 ( 0.00%)    10.00 (25.00%)     9.00 (12.50%)     8.00 ( 0.00%)
  Files deleted alone per second:       556.00 ( 0.00%)  1224.00 (120.14%)  3062.00 (450.72%)  6124.00 (1001.44%)
  Files delete/transact per second:       8.00 ( 0.00%)    10.00 (25.00%)     9.00 (12.50%)     8.00 ( 0.00%)

  MMTests Statistics: duration
  Sys Time Running Test (seconds)             113.34    107.99    109.73    108.72
  User+Sys Time Running Test (seconds)        145.51    139.81    143.32    143.55
  Total Elapsed Time (seconds)               1159.16    899.23    980.17   1062.27

  MMTests Statistics: vmstat
  Page Ins                                    13710192    13729032    13727944    13760136
  Page Outs                                   43071140    42987228    42733684    42931624
  Swap Ins                                           0           0           0           0
  Swap Outs                                          0           0           0           0
  Direct pages scanned                               0           0           0           0
  Kswapd pages scanned                         9941613     9937443     9939085     9929154
  Kswapd pages reclaimed                       9940926     9936751     9938397     9928465
  Direct pages reclaimed                             0           0           0           0
  Kswapd efficiency                                99%         99%         99%         99%
  Kswapd velocity                             8576.567   11051.058   10140.164    9347.109
  Direct efficiency                               100%        100%        100%        100%
  Direct velocity                                0.000       0.000       0.000       0.000

It looks like here that the full series regresses performance but as
ftrace showed no usage of wait_iff_congested() or sync reclaim I am
assuming it's a disruption due to monitoring.  Other data such as memory
usage, page IO, swap IO all looked similar.

Running a benchmark with a plain DD showed nothing very interesting.
The full series stalled in wait_iff_congested() slightly less but stall
times on vanilla kernels were marginal.

Running a benchmark that hammered on file-backed mappings showed stalls
due to congestion but not in sync writebacks

  MICRO
                                       3.3.0-vanilla       rc2-vanilla  lumpyremove-v2r3       nosync-v2r3
  MMTests Statistics: duration
  Sys Time Running Test (seconds)             308.13    294.50    298.75    299.53
  User+Sys Time Running Test (seconds)        330.45    316.28    318.93    320.79
  Total Elapsed Time (seconds)               1814.90   1833.88   1821.14   1832.91

  MMTests Statistics: vmstat
  Page Ins                                      108712      120708       97224      110344
  Page Outs                                  155514576   156017404   155813676   156193256
  Swap Ins                                           0           0           0           0
  Swap Outs                                          0           0           0           0
  Direct pages scanned                         2599253     1550480     2512822     2414760
  Kswapd pages scanned                        69742364    71150694    68839041    69692533
  Kswapd pages reclaimed                      34824488    34773341    34796602    34799396
  Direct pages reclaimed                         53693       94750       61792       75205
  Kswapd efficiency                                49%         48%         50%         49%
  Kswapd velocity                            38427.662   38797.901   37799.972   38022.889
  Direct efficiency                                 2%          6%          2%          3%
  Direct velocity                             1432.174     845.464    1379.807    1317.446
  Percentage direct scans                           3%          2%          3%          3%
  Page writes by reclaim                             0           0           0           0
  Page writes file                                   0           0           0           0
  Page writes anon                                   0           0           0           0
  Page reclaim immediate                             0           0           0        1218
  Page rescued immediate                             0           0           0           0
  Slabs scanned                                  15360       16384       13312       16384
  Direct inode steals                                0           0           0           0
  Kswapd inode steals                             4340        4327        1630        4323

  FTrace Reclaim Statistics: congestion_wait
  Direct number congest     waited                 0          0          0          0
  Direct time   congest     waited               0ms        0ms        0ms        0ms
  Direct full   congest     waited                 0          0          0          0
  Direct number conditional waited               900        870        754        789
  Direct time   conditional waited               0ms        0ms        0ms       20ms
  Direct full   conditional waited                 0          0          0          0
  KSwapd number congest     waited              2106       2308       2116       1915
  KSwapd time   congest     waited          139924ms   157832ms   125652ms   132516ms
  KSwapd full   congest     waited              1346       1530       1202       1278
  KSwapd number conditional waited             12922      16320      10943      14670
  KSwapd time   conditional waited               0ms        0ms        0ms        0ms
  KSwapd full   conditional waited                 0          0          0          0

Reclaim statistics are not radically changed.  The stall times in kswapd
are massive but it is clear that it is due to calls to congestion_wait()
and that is almost certainly the call in balance_pgdat().  Otherwise
stalls due to dirty pages are non-existant.

I ran a benchmark that stressed high-order allocation.  This is very
artifical load but was used in the past to evaluate lumpy reclaim and
compaction.  Generally I look at allocation success rates and latency
figures.

  STRESS-HIGHALLOC
                   3.3.0-vanilla       rc2-vanilla  lumpyremove-v2r3       nosync-v2r3
  Pass 1          81.00 ( 0.00%)    28.00 (-53.00%)    24.00 (-57.00%)    28.00 (-53.00%)
  Pass 2          82.00 ( 0.00%)    39.00 (-43.00%)    38.00 (-44.00%)    43.00 (-39.00%)
  while Rested    88.00 ( 0.00%)    87.00 (-1.00%)    88.00 ( 0.00%)    88.00 ( 0.00%)

  MMTests Statistics: duration
  Sys Time Running Test (seconds)             740.93    681.42    685.14    684.87
  User+Sys Time Running Test (seconds)       2922.65   3269.52   3281.35   3279.44
  Total Elapsed Time (seconds)               1161.73   1152.49   1159.55   1161.44

  MMTests Statistics: vmstat
  Page Ins                                     4486020     2807256     2855944     2876244
  Page Outs                                    7261600     7973688     7975320     7986120
  Swap Ins                                       31694           0           0           0
  Swap Outs                                      98179           0           0           0
  Direct pages scanned                           53494       57731       34406      113015
  Kswapd pages scanned                         6271173     1287481     1278174     1219095
  Kswapd pages reclaimed                       2029240     1281025     1260708     1201583
  Direct pages reclaimed                          1468       14564       16649       92456
  Kswapd efficiency                                32%         99%         98%         98%
  Kswapd velocity                             5398.133    1117.130    1102.302    1049.641
  Direct efficiency                                 2%         25%         48%         81%
  Direct velocity                               46.047      50.092      29.672      97.306
  Percentage direct scans                           0%          4%          2%          8%
  Page writes by reclaim                       1616049           0           0           0
  Page writes file                             1517870           0           0           0
  Page writes anon                               98179           0           0           0
  Page reclaim immediate                        103778       27339        9796       17831
  Page rescued immediate                             0           0           0           0
  Slabs scanned                                1096704      986112      980992      998400
  Direct inode steals                              223      215040      216736      247881
  Kswapd inode steals                           175331       61548       68444       63066
  Kswapd skipped wait                            21991           0           1           0
  THP fault alloc                                    1         135         125         134
  THP collapse alloc                               393         311         228         236
  THP splits                                        25          13           7           8
  THP fault fallback                                 0           0           0           0
  THP collapse fail                                  3           5           7           7
  Compaction stalls                                865        1270        1422        1518
  Compaction success                               370         401         353         383
  Compaction failures                              495         869        1069        1135
  Compaction pages moved                        870155     3828868     4036106     4423626
  Compaction move failure                        26429       23865       29742       27514

Success rates are completely hosed for 3.4-rc2 which is almost certainly
due to commit fe2c2a106663 ("vmscan: reclaim at order 0 when compaction
is enabled").  I expected this would happen for kswapd and impair
allocation success rates (https://lkml.org/lkml/2012/1/25/166) but I did
not anticipate this much a difference: 80% less scanning, 37% less
reclaim by kswapd

In comparison, reclaim/compaction is not aggressive and gives up easily
which is the intended behaviour.  hugetlbfs uses __GFP_REPEAT and would
be much more aggressive about reclaim/compaction than THP allocations
are.  The stress test above is allocating like neither THP or hugetlbfs
but is much closer to THP.

Mainline is now impaired in terms of high order allocation under heavy
load although I do not know to what degree as I did not test with
__GFP_REPEAT.  Keep this in mind for bugs related to hugepage pool
resizing, THP allocation and high order atomic allocation failures from
network devices.

In terms of congestion throttling, I see the following for this test

  FTrace Reclaim Statistics: congestion_wait
  Direct number congest     waited                 3          0          0          0
  Direct time   congest     waited               0ms        0ms        0ms        0ms
  Direct full   congest     waited                 0          0          0          0
  Direct number conditional waited               957        512       1081       1075
  Direct time   conditional waited               0ms        0ms        0ms        0ms
  Direct full   conditional waited                 0          0          0          0
  KSwapd number congest     waited                36          4          3          5
  KSwapd time   congest     waited            3148ms      400ms      300ms      500ms
  KSwapd full   congest     waited                30          4          3          5
  KSwapd number conditional waited             88514        197        332        542
  KSwapd time   conditional waited            4980ms        0ms        0ms        0ms
  KSwapd full   conditional waited                49          0          0          0

The "conditional waited" times are the most interesting as this is
directly impacted by the number of dirty pages encountered during scan.
As lumpy reclaim is no longer scanning contiguous ranges, it is finding
fewer dirty pages.  This brings wait times from about 5 seconds to 0.
kswapd itself is still calling congestion_wait() so it'll still stall but
it's a lot less.

In terms of the type of IO we were doing, I see this

  FTrace Reclaim Statistics: mm_vmscan_writepage
  Direct writes anon  sync                         0          0          0          0
  Direct writes anon  async                        0          0          0          0
  Direct writes file  sync                         0          0          0          0
  Direct writes file  async                        0          0          0          0
  Direct writes mixed sync                         0          0          0          0
  Direct writes mixed async                        0          0          0          0
  KSwapd writes anon  sync                         0          0          0          0
  KSwapd writes anon  async                    91682          0          0          0
  KSwapd writes file  sync                         0          0          0          0
  KSwapd writes file  async                   822629          0          0          0
  KSwapd writes mixed sync                         0          0          0          0
  KSwapd writes mixed async                        0          0          0          0

In 3.2, kswapd was doing a bunch of async writes of pages but
reclaim/compaction was never reaching a point where it was doing sync
IO.  This does not guarantee that reclaim/compaction was not calling
wait_on_page_writeback() but I would consider it unlikely.  It indicates
that merging patches 2 and 3 to stop reclaim/compaction calling
wait_on_page_writeback() should be safe.

This patch:

Lumpy reclaim had a purpose but in the mind of some, it was to kick the
system so hard it trashed.  For others the purpose was to complicate
vmscan.c.  Over time it was giving softer shoes and a nicer attitude but
memory compaction needs to step up and replace it so this patch sends
lumpy reclaim to the farm.

The tracepoint format changes for isolating LRU pages with this patch
applied.  Furthermore reclaim/compaction can no longer queue dirty pages
in pageout() if the underlying BDI is congested.  Lumpy reclaim used
this logic and reclaim/compaction was using it in error.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ying Han <yinghan@google.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 5721954..bdaf32f 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -263,22 +263,16 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
 		unsigned long nr_requested,
 		unsigned long nr_scanned,
 		unsigned long nr_taken,
-		unsigned long nr_lumpy_taken,
-		unsigned long nr_lumpy_dirty,
-		unsigned long nr_lumpy_failed,
 		isolate_mode_t isolate_mode,
 		int file),
 
-	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file),
+	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
 
 	TP_STRUCT__entry(
 		__field(int, order)
 		__field(unsigned long, nr_requested)
 		__field(unsigned long, nr_scanned)
 		__field(unsigned long, nr_taken)
-		__field(unsigned long, nr_lumpy_taken)
-		__field(unsigned long, nr_lumpy_dirty)
-		__field(unsigned long, nr_lumpy_failed)
 		__field(isolate_mode_t, isolate_mode)
 		__field(int, file)
 	),
@@ -288,22 +282,16 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
 		__entry->nr_requested = nr_requested;
 		__entry->nr_scanned = nr_scanned;
 		__entry->nr_taken = nr_taken;
-		__entry->nr_lumpy_taken = nr_lumpy_taken;
-		__entry->nr_lumpy_dirty = nr_lumpy_dirty;
-		__entry->nr_lumpy_failed = nr_lumpy_failed;
 		__entry->isolate_mode = isolate_mode;
 		__entry->file = file;
 	),
 
-	TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu file=%d",
+	TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
 		__entry->isolate_mode,
 		__entry->order,
 		__entry->nr_requested,
 		__entry->nr_scanned,
 		__entry->nr_taken,
-		__entry->nr_lumpy_taken,
-		__entry->nr_lumpy_dirty,
-		__entry->nr_lumpy_failed,
 		__entry->file)
 );
 
@@ -313,13 +301,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
 		unsigned long nr_requested,
 		unsigned long nr_scanned,
 		unsigned long nr_taken,
-		unsigned long nr_lumpy_taken,
-		unsigned long nr_lumpy_dirty,
-		unsigned long nr_lumpy_failed,
 		isolate_mode_t isolate_mode,
 		int file),
 
-	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file)
+	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
 
 );
 
@@ -329,13 +314,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
 		unsigned long nr_requested,
 		unsigned long nr_scanned,
 		unsigned long nr_taken,
-		unsigned long nr_lumpy_taken,
-		unsigned long nr_lumpy_dirty,
-		unsigned long nr_lumpy_failed,
 		isolate_mode_t isolate_mode,
 		int file),
 
-	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file)
+	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
 
 );
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ca46080..546d02c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -58,9 +58,6 @@
  * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
  * RECLAIM_MODE_ASYNC:  Do not block
  * RECLAIM_MODE_SYNC:   Allow blocking e.g. call wait_on_page_writeback
- * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
- *			page from the LRU and reclaim all pages within a
- *			naturally aligned range
  * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
  *			order-0 pages and then compact the zone
  */
@@ -68,7 +65,6 @@ typedef unsigned __bitwise__ reclaim_mode_t;
 #define RECLAIM_MODE_SINGLE		((__force reclaim_mode_t)0x01u)
 #define RECLAIM_MODE_ASYNC		((__force reclaim_mode_t)0x02u)
 #define RECLAIM_MODE_SYNC		((__force reclaim_mode_t)0x04u)
-#define RECLAIM_MODE_LUMPYRECLAIM	((__force reclaim_mode_t)0x08u)
 #define RECLAIM_MODE_COMPACTION		((__force reclaim_mode_t)0x10u)
 
 struct scan_control {
@@ -367,27 +363,17 @@ out:
 static void set_reclaim_mode(int priority, struct scan_control *sc,
 				   bool sync)
 {
+	/* Sync reclaim used only for compaction */
 	reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
 
 	/*
-	 * Initially assume we are entering either lumpy reclaim or
-	 * reclaim/compaction.Depending on the order, we will either set the
-	 * sync mode or just reclaim order-0 pages later.
-	 */
-	if (COMPACTION_BUILD)
-		sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
-	else
-		sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
-
-	/*
-	 * Avoid using lumpy reclaim or reclaim/compaction if possible by
-	 * restricting when its set to either costly allocations or when
+	 * Restrict reclaim/compaction to costly allocations or when
 	 * under memory pressure
 	 */
-	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-		sc->reclaim_mode |= syncmode;
-	else if (sc->order && priority < DEF_PRIORITY - 2)
-		sc->reclaim_mode |= syncmode;
+	if (COMPACTION_BUILD && sc->order &&
+			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
+			 priority < DEF_PRIORITY - 2))
+		sc->reclaim_mode = RECLAIM_MODE_COMPACTION | syncmode;
 	else
 		sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
 }
@@ -416,10 +402,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
 		return 1;
 	if (bdi == current->backing_dev_info)
 		return 1;
-
-	/* lumpy reclaim for hugepage often need a lot of write */
-	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-		return 1;
 	return 0;
 }
 
@@ -710,10 +692,6 @@ static enum page_references page_check_references(struct page *page,
 	referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
 	referenced_page = TestClearPageReferenced(page);
 
-	/* Lumpy reclaim - ignore references */
-	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
-		return PAGEREF_RECLAIM;
-
 	/*
 	 * Mlock lost the isolation race with us.  Let try_to_unmap()
 	 * move the page to the unevictable list.
@@ -824,7 +802,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				wait_on_page_writeback(page);
 			else {
 				unlock_page(page);
-				goto keep_lumpy;
+				goto keep_reclaim_mode;
 			}
 		}
 
@@ -908,7 +886,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto activate_locked;
 			case PAGE_SUCCESS:
 				if (PageWriteback(page))
-					goto keep_lumpy;
+					goto keep_reclaim_mode;
 				if (PageDirty(page))
 					goto keep;
 
@@ -1008,7 +986,7 @@ keep_locked:
 		unlock_page(page);
 keep:
 		reset_reclaim_mode(sc);
-keep_lumpy:
+keep_reclaim_mode:
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
@@ -1064,11 +1042,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
 	if (!all_lru_mode && !!page_is_file_cache(page) != file)
 		return ret;
 
-	/*
-	 * When this function is being called for lumpy reclaim, we
-	 * initially look into all LRU pages, active, inactive and
-	 * unevictable; only give shrink_page_list evictable pages.
-	 */
+	/* Do not give back unevictable pages for compaction */
 	if (PageUnevictable(page))
 		return ret;
 
@@ -1153,9 +1127,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	struct lruvec *lruvec;
 	struct list_head *src;
 	unsigned long nr_taken = 0;
-	unsigned long nr_lumpy_taken = 0;
-	unsigned long nr_lumpy_dirty = 0;
-	unsigned long nr_lumpy_failed = 0;
 	unsigned long scan;
 	int lru = LRU_BASE;
 
@@ -1168,10 +1139,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
 	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 		struct page *page;
-		unsigned long pfn;
-		unsigned long end_pfn;
-		unsigned long page_pfn;
-		int zone_id;
 
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
@@ -1193,84 +1160,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		default:
 			BUG();
 		}
-
-		if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
-			continue;
-
-		/*
-		 * Attempt to take all pages in the order aligned region
-		 * surrounding the tag page.  Only take those pages of
-		 * the same active state as that tag page.  We may safely
-		 * round the target page pfn down to the requested order
-		 * as the mem_map is guaranteed valid out to MAX_ORDER,
-		 * where that page is in a different zone we will detect
-		 * it from its zone id and abort this block scan.
-		 */
-		zone_id = page_zone_id(page);
-		page_pfn = page_to_pfn(page);
-		pfn = page_pfn & ~((1 << sc->order) - 1);
-		end_pfn = pfn + (1 << sc->order);
-		for (; pfn < end_pfn; pfn++) {
-			struct page *cursor_page;
-
-			/* The target page is in the block, ignore it. */
-			if (unlikely(pfn == page_pfn))
-				continue;
-
-			/* Avoid holes within the zone. */
-			if (unlikely(!pfn_valid_within(pfn)))
-				break;
-
-			cursor_page = pfn_to_page(pfn);
-
-			/* Check that we have not crossed a zone boundary. */
-			if (unlikely(page_zone_id(cursor_page) != zone_id))
-				break;
-
-			/*
-			 * If we don't have enough swap space, reclaiming of
-			 * anon page which don't already have a swap slot is
-			 * pointless.
-			 */
-			if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
-			    !PageSwapCache(cursor_page))
-				break;
-
-			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
-				unsigned int isolated_pages;
-
-				mem_cgroup_lru_del(cursor_page);
-				list_move(&cursor_page->lru, dst);
-				isolated_pages = hpage_nr_pages(cursor_page);
-				nr_taken += isolated_pages;
-				nr_lumpy_taken += isolated_pages;
-				if (PageDirty(cursor_page))
-					nr_lumpy_dirty += isolated_pages;
-				scan++;
-				pfn += isolated_pages - 1;
-			} else {
-				/*
-				 * Check if the page is freed already.
-				 *
-				 * We can't use page_count() as that
-				 * requires compound_head and we don't
-				 * have a pin on the page here. If a
-				 * page is tail, we may or may not
-				 * have isolated the head, so assume
-				 * it's not free, it'd be tricky to
-				 * track the head status without a
-				 * page pin.
-				 */
-				if (!PageTail(cursor_page) &&
-				    !atomic_read(&cursor_page->_count))
-					continue;
-				break;
-			}
-		}
-
-		/* If we break out of the loop above, lumpy reclaim failed */
-		if (pfn < end_pfn)
-			nr_lumpy_failed++;
 	}
 
 	*nr_scanned = scan;
@@ -1278,7 +1167,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	trace_mm_vmscan_lru_isolate(sc->order,
 			nr_to_scan, scan,
 			nr_taken,
-			nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
 			mode, file);
 	return nr_taken;
 }
@@ -1466,13 +1354,13 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
 					int priority,
 					struct scan_control *sc)
 {
-	int lumpy_stall_priority;
+	int stall_priority;
 
 	/* kswapd should not stall on sync IO */
 	if (current_is_kswapd())
 		return false;
 
-	/* Only stall on lumpy reclaim */
+	/* Only stall for memory compaction */
 	if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
 		return false;
 
@@ -1487,11 +1375,11 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
 	 * priority to be much higher before stalling.
 	 */
 	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-		lumpy_stall_priority = DEF_PRIORITY;
+		stall_priority = DEF_PRIORITY;
 	else
-		lumpy_stall_priority = DEF_PRIORITY / 3;
+		stall_priority = DEF_PRIORITY / 3;
 
-	return priority <= lumpy_stall_priority;
+	return priority <= stall_priority;
 }
 
 /*
@@ -1523,8 +1411,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 	}
 
 	set_reclaim_mode(priority, sc, false);
-	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
-		isolate_mode |= ISOLATE_ACTIVE;
 
 	lru_add_drain();
 
-- 
cgit v0.10.2


From 41ac1999c3e3563e1810b14878a869c79c9368bb Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 29 May 2012 15:06:19 -0700
Subject: mm: vmscan: do not stall on writeback during memory compaction

This patch stops reclaim/compaction entering sync reclaim as this was
only intended for lumpy reclaim and an oversight.  Page migration has
its own logic for stalling on writeback pages if necessary and memory
compaction is already using it.

Waiting on page writeback is bad for a number of reasons but the primary
one is that waiting on writeback to a slow device like USB can take a
considerable length of time.  Page reclaim instead uses
wait_iff_congested() to throttle if too many dirty pages are being
scanned.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ying Han <yinghan@google.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index bdaf32f..82f6933 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -13,7 +13,7 @@
 #define RECLAIM_WB_ANON		0x0001u
 #define RECLAIM_WB_FILE		0x0002u
 #define RECLAIM_WB_MIXED	0x0010u
-#define RECLAIM_WB_SYNC		0x0004u
+#define RECLAIM_WB_SYNC		0x0004u /* Unused, all reclaim async */
 #define RECLAIM_WB_ASYNC	0x0008u
 
 #define show_reclaim_flags(flags)				\
@@ -27,13 +27,13 @@
 
 #define trace_reclaim_flags(page, sync) ( \
 	(page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
-	(sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC)   \
+	(RECLAIM_WB_ASYNC) \
 	)
 
-#define trace_shrink_flags(file, sync) ( \
-	(sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \
-			(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) |  \
-	(sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
+#define trace_shrink_flags(file, sync) \
+	( \
+		(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
+		(RECLAIM_WB_ASYNC) \
 	)
 
 TRACE_EVENT(mm_vmscan_kswapd_sleep,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 546d02c..e27f27d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,15 +56,11 @@
 /*
  * reclaim_mode determines how the inactive list is shrunk
  * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
- * RECLAIM_MODE_ASYNC:  Do not block
- * RECLAIM_MODE_SYNC:   Allow blocking e.g. call wait_on_page_writeback
  * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
  *			order-0 pages and then compact the zone
  */
 typedef unsigned __bitwise__ reclaim_mode_t;
 #define RECLAIM_MODE_SINGLE		((__force reclaim_mode_t)0x01u)
-#define RECLAIM_MODE_ASYNC		((__force reclaim_mode_t)0x02u)
-#define RECLAIM_MODE_SYNC		((__force reclaim_mode_t)0x04u)
 #define RECLAIM_MODE_COMPACTION		((__force reclaim_mode_t)0x10u)
 
 struct scan_control {
@@ -360,12 +356,8 @@ out:
 	return ret;
 }
 
-static void set_reclaim_mode(int priority, struct scan_control *sc,
-				   bool sync)
+static void set_reclaim_mode(int priority, struct scan_control *sc)
 {
-	/* Sync reclaim used only for compaction */
-	reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
-
 	/*
 	 * Restrict reclaim/compaction to costly allocations or when
 	 * under memory pressure
@@ -373,14 +365,14 @@ static void set_reclaim_mode(int priority, struct scan_control *sc,
 	if (COMPACTION_BUILD && sc->order &&
 			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
 			 priority < DEF_PRIORITY - 2))
-		sc->reclaim_mode = RECLAIM_MODE_COMPACTION | syncmode;
+		sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
 	else
-		sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
+		sc->reclaim_mode = RECLAIM_MODE_SINGLE;
 }
 
 static void reset_reclaim_mode(struct scan_control *sc)
 {
-	sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
+	sc->reclaim_mode = RECLAIM_MODE_SINGLE;
 }
 
 static inline int is_page_cache_freeable(struct page *page)
@@ -791,19 +783,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		if (PageWriteback(page)) {
 			nr_writeback++;
-			/*
-			 * Synchronous reclaim cannot queue pages for
-			 * writeback due to the possibility of stack overflow
-			 * but if it encounters a page under writeback, wait
-			 * for the IO to complete.
-			 */
-			if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
-			    may_enter_fs)
-				wait_on_page_writeback(page);
-			else {
-				unlock_page(page);
-				goto keep_reclaim_mode;
-			}
+			unlock_page(page);
+			goto keep;
 		}
 
 		references = page_check_references(page, mz, sc);
@@ -886,7 +867,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto activate_locked;
 			case PAGE_SUCCESS:
 				if (PageWriteback(page))
-					goto keep_reclaim_mode;
+					goto keep;
 				if (PageDirty(page))
 					goto keep;
 
@@ -985,8 +966,6 @@ activate_locked:
 keep_locked:
 		unlock_page(page);
 keep:
-		reset_reclaim_mode(sc);
-keep_reclaim_mode:
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
@@ -1342,47 +1321,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
 }
 
 /*
- * Returns true if a direct reclaim should wait on pages under writeback.
- *
- * If we are direct reclaiming for contiguous pages and we do not reclaim
- * everything in the list, try again and wait for writeback IO to complete.
- * This will stall high-order allocations noticeably. Only do that when really
- * need to free the pages under high memory pressure.
- */
-static inline bool should_reclaim_stall(unsigned long nr_taken,
-					unsigned long nr_freed,
-					int priority,
-					struct scan_control *sc)
-{
-	int stall_priority;
-
-	/* kswapd should not stall on sync IO */
-	if (current_is_kswapd())
-		return false;
-
-	/* Only stall for memory compaction */
-	if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
-		return false;
-
-	/* If we have reclaimed everything on the isolated list, no stall */
-	if (nr_freed == nr_taken)
-		return false;
-
-	/*
-	 * For high-order allocations, there are two stall thresholds.
-	 * High-cost allocations stall immediately where as lower
-	 * order allocations such as stacks require the scanning
-	 * priority to be much higher before stalling.
-	 */
-	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-		stall_priority = DEF_PRIORITY;
-	else
-		stall_priority = DEF_PRIORITY / 3;
-
-	return priority <= stall_priority;
-}
-
-/*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
  */
@@ -1410,7 +1348,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 			return SWAP_CLUSTER_MAX;
 	}
 
-	set_reclaim_mode(priority, sc, false);
+	set_reclaim_mode(priority, sc);
 
 	lru_add_drain();
 
@@ -1442,13 +1380,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 	nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
 						&nr_dirty, &nr_writeback);
 
-	/* Check if we should syncronously wait for writeback */
-	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
-		set_reclaim_mode(priority, sc, true);
-		nr_reclaimed += shrink_page_list(&page_list, mz, sc,
-					priority, &nr_dirty, &nr_writeback);
-	}
-
 	spin_lock_irq(&zone->lru_lock);
 
 	reclaim_stat->recent_scanned[0] += nr_anon;
-- 
cgit v0.10.2


From 23b9da55c5b0feb484bd5e8615f4eb1ce4169453 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 29 May 2012 15:06:20 -0700
Subject: mm: vmscan: remove reclaim_mode_t

There is little motiviation for reclaim_mode_t once RECLAIM_MODE_[A]SYNC
and lumpy reclaim have been removed.  This patch gets rid of
reclaim_mode_t as well and improves the documentation about what
reclaim/compaction is and when it is triggered.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ying Han <yinghan@google.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 82f6933..bab3b87 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -25,12 +25,12 @@
 		{RECLAIM_WB_ASYNC,	"RECLAIM_WB_ASYNC"}	\
 		) : "RECLAIM_WB_NONE"
 
-#define trace_reclaim_flags(page, sync) ( \
+#define trace_reclaim_flags(page) ( \
 	(page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
 	(RECLAIM_WB_ASYNC) \
 	)
 
-#define trace_shrink_flags(file, sync) \
+#define trace_shrink_flags(file) \
 	( \
 		(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
 		(RECLAIM_WB_ASYNC) \
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e27f27d..68e5819 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,16 +53,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/vmscan.h>
 
-/*
- * reclaim_mode determines how the inactive list is shrunk
- * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
- * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
- *			order-0 pages and then compact the zone
- */
-typedef unsigned __bitwise__ reclaim_mode_t;
-#define RECLAIM_MODE_SINGLE		((__force reclaim_mode_t)0x01u)
-#define RECLAIM_MODE_COMPACTION		((__force reclaim_mode_t)0x10u)
-
 struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
@@ -89,12 +79,6 @@ struct scan_control {
 	int order;
 
 	/*
-	 * Intend to reclaim enough continuous memory rather than reclaim
-	 * enough amount of memory. i.e, mode for high order allocation.
-	 */
-	reclaim_mode_t reclaim_mode;
-
-	/*
 	 * The memory cgroup that hit its limit and as a result is the
 	 * primary target of this reclaim invocation.
 	 */
@@ -356,25 +340,6 @@ out:
 	return ret;
 }
 
-static void set_reclaim_mode(int priority, struct scan_control *sc)
-{
-	/*
-	 * Restrict reclaim/compaction to costly allocations or when
-	 * under memory pressure
-	 */
-	if (COMPACTION_BUILD && sc->order &&
-			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
-			 priority < DEF_PRIORITY - 2))
-		sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
-	else
-		sc->reclaim_mode = RECLAIM_MODE_SINGLE;
-}
-
-static void reset_reclaim_mode(struct scan_control *sc)
-{
-	sc->reclaim_mode = RECLAIM_MODE_SINGLE;
-}
-
 static inline int is_page_cache_freeable(struct page *page)
 {
 	/*
@@ -497,8 +462,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 			/* synchronous write or broken a_ops? */
 			ClearPageReclaim(page);
 		}
-		trace_mm_vmscan_writepage(page,
-			trace_reclaim_flags(page, sc->reclaim_mode));
+		trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
 		inc_zone_page_state(page, NR_VMSCAN_WRITE);
 		return PAGE_SUCCESS;
 	}
@@ -953,7 +917,6 @@ cull_mlocked:
 			try_to_free_swap(page);
 		unlock_page(page);
 		putback_lru_page(page);
-		reset_reclaim_mode(sc);
 		continue;
 
 activate_locked:
@@ -1348,8 +1311,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 			return SWAP_CLUSTER_MAX;
 	}
 
-	set_reclaim_mode(priority, sc);
-
 	lru_add_drain();
 
 	if (!sc->may_unmap)
@@ -1433,7 +1394,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 		zone_idx(zone),
 		nr_scanned, nr_reclaimed,
 		priority,
-		trace_shrink_flags(file, sc->reclaim_mode));
+		trace_shrink_flags(file));
 	return nr_reclaimed;
 }
 
@@ -1512,8 +1473,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
 
 	lru_add_drain();
 
-	reset_reclaim_mode(sc);
-
 	if (!sc->may_unmap)
 		isolate_mode |= ISOLATE_UNMAPPED;
 	if (!sc->may_writepage)
@@ -1826,23 +1785,35 @@ out:
 	}
 }
 
+/* Use reclaim/compaction for costly allocs or under memory pressure */
+static bool in_reclaim_compaction(int priority, struct scan_control *sc)
+{
+	if (COMPACTION_BUILD && sc->order &&
+			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
+			 priority < DEF_PRIORITY - 2))
+		return true;
+
+	return false;
+}
+
 /*
- * Reclaim/compaction depends on a number of pages being freed. To avoid
- * disruption to the system, a small number of order-0 pages continue to be
- * rotated and reclaimed in the normal fashion. However, by the time we get
- * back to the allocator and call try_to_compact_zone(), we ensure that
- * there are enough free pages for it to be likely successful
+ * Reclaim/compaction is used for high-order allocation requests. It reclaims
+ * order-0 pages before compacting the zone. should_continue_reclaim() returns
+ * true if more pages should be reclaimed such that when the page allocator
+ * calls try_to_compact_zone() that it will have enough free pages to succeed.
+ * It will give up earlier than that if there is difficulty reclaiming pages.
  */
 static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
 					unsigned long nr_reclaimed,
 					unsigned long nr_scanned,
+					int priority,
 					struct scan_control *sc)
 {
 	unsigned long pages_for_compaction;
 	unsigned long inactive_lru_pages;
 
 	/* If not in reclaim/compaction mode, stop */
-	if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
+	if (!in_reclaim_compaction(priority, sc))
 		return false;
 
 	/* Consider stopping depending on scan and reclaim activity */
@@ -1944,7 +1915,8 @@ restart:
 
 	/* reclaim/compaction might need reclaim to continue */
 	if (should_continue_reclaim(mz, nr_reclaimed,
-					sc->nr_scanned - nr_scanned, sc))
+					sc->nr_scanned - nr_scanned,
+					priority, sc))
 		goto restart;
 
 	throttle_vm_writeout(sc->gfp_mask);
-- 
cgit v0.10.2


From f62388187207bea83f1865d507bf892a1f9152c3 Mon Sep 17 00:00:00 2001
From: Ryota Ozaki <ozaki.ryota@gmail.com>
Date: Tue, 29 May 2012 15:06:20 -0700
Subject: mm: fix off-by-one bug in print_nodes_state()

/sys/devices/system/node/{online,possible} outputs a garbage byte
because print_nodes_state() returns content size + 1.  To fix the bug,
the patch changes the use of cpuset_sprintf_cpulist to follow the use at
other places, which is clearer and safer.

This bug was introduced in v2.6.24 (commit bde631a51876: "mm: add node
states sysfs class attributeS").

Signed-off-by: Ryota Ozaki <ozaki.ryota@gmail.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 90aa2a1..af1a177 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -592,11 +592,9 @@ static ssize_t print_nodes_state(enum node_states state, char *buf)
 {
 	int n;
 
-	n = nodelist_scnprintf(buf, PAGE_SIZE, node_states[state]);
-	if (n > 0 && PAGE_SIZE > n + 1) {
-		*(buf + n++) = '\n';
-		*(buf + n++) = '\0';
-	}
+	n = nodelist_scnprintf(buf, PAGE_SIZE-2, node_states[state]);
+	buf[n++] = '\n';
+	buf[n] = '\0';
 	return n;
 }
 
-- 
cgit v0.10.2


From 4d67d860531ad5378dedfad7661c540f3365013d Mon Sep 17 00:00:00 2001
From: Thomas Meyer <thomas@m3y3r.de>
Date: Tue, 29 May 2012 15:06:21 -0700
Subject: mm: use kcalloc() instead of kzalloc() to allocate array

The advantage of kcalloc is, that will prevent integer overflows which
could result from the multiplication of number of elements and size and
it is also a bit nicer to read.

The semantic patch that makes this change is available in
https://lkml.org/lkml/2011/11/25/107

Signed-off-by: Thomas Meyer <thomas@m3y3r.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 94dff88..c28b0b9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2375,8 +2375,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 		return NULL;
 	}
 
-	vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
-	vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
+	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
+	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
 	if (!vas || !vms)
 		goto err_free2;
 
-- 
cgit v0.10.2


From 841e31e5cc6219d62054788faa289b6ed682d068 Mon Sep 17 00:00:00 2001
From: Rajman Mekaco <rajman.mekaco@gmail.com>
Date: Tue, 29 May 2012 15:06:21 -0700
Subject: mm/mmap.c: find_vma(): remove unnecessary if(mm) check

The "if (mm)" check is not required in find_vma, as the kernel code
calls find_vma only when it is absolutely sure that the mm_struct arg to
it is non-NULL.

Remove the if(mm) check and adding the a WARN_ONCE(!mm) for now.  This
will serve the purpose of mandating that the execution
context(user-mode/kernel-mode) be known before find_vma is called.  Also
fixed 2 checkpatch.pl errors in the declaration of the rb_node and
vma_tmp local variables.

I was browsing through the internet and read a discussion at
https://lkml.org/lkml/2012/3/27/342 which discusses removal of the
validation check within find_vma.  Since no-one responded, I decided to
send this patch with Andrew's suggestions.

[akpm@linux-foundation.org: add remove-me comment]
Signed-off-by: Rajman Mekaco <rajman.mekaco@gmail.com>
Cc: Kautuk Consul <consul.kautuk@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/mmap.c b/mm/mmap.c
index e8dcfc7..4a9c2a3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1639,33 +1639,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 {
 	struct vm_area_struct *vma = NULL;
 
-	if (mm) {
-		/* Check the cache first. */
-		/* (Cache hit rate is typically around 35%.) */
-		vma = mm->mmap_cache;
-		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
-			struct rb_node * rb_node;
-
-			rb_node = mm->mm_rb.rb_node;
-			vma = NULL;
-
-			while (rb_node) {
-				struct vm_area_struct * vma_tmp;
-
-				vma_tmp = rb_entry(rb_node,
-						struct vm_area_struct, vm_rb);
-
-				if (vma_tmp->vm_end > addr) {
-					vma = vma_tmp;
-					if (vma_tmp->vm_start <= addr)
-						break;
-					rb_node = rb_node->rb_left;
-				} else
-					rb_node = rb_node->rb_right;
-			}
-			if (vma)
-				mm->mmap_cache = vma;
+	if (WARN_ON_ONCE(!mm))		/* Remove this in linux-3.6 */
+		return NULL;
+
+	/* Check the cache first. */
+	/* (Cache hit rate is typically around 35%.) */
+	vma = mm->mmap_cache;
+	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+		struct rb_node *rb_node;
+
+		rb_node = mm->mm_rb.rb_node;
+		vma = NULL;
+
+		while (rb_node) {
+			struct vm_area_struct *vma_tmp;
+
+			vma_tmp = rb_entry(rb_node,
+					   struct vm_area_struct, vm_rb);
+
+			if (vma_tmp->vm_end > addr) {
+				vma = vma_tmp;
+				if (vma_tmp->vm_start <= addr)
+					break;
+				rb_node = rb_node->rb_left;
+			} else
+				rb_node = rb_node->rb_right;
 		}
+		if (vma)
+			mm->mmap_cache = vma;
 	}
 	return vma;
 }
-- 
cgit v0.10.2


From 7edc8b0ac16cbaed7cb4ea4c6b95ce98d2997e84 Mon Sep 17 00:00:00 2001
From: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Date: Tue, 29 May 2012 15:06:22 -0700
Subject: mm/fork: fix overflow in vma length when copying mmap on clone

The vma length in dup_mmap is calculated and stored in a unsigned int,
which is insufficient and hence overflows for very large maps (beyond
16TB). The following program demonstrates this:

#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

#define GIG 1024 * 1024 * 1024L
#define EXTENT 16393

int main(void)
{
        int i, r;
        void *m;
        char buf[1024];

        for (i = 0; i < EXTENT; i++) {
                m = mmap(NULL, (size_t) 1 * 1024 * 1024 * 1024L,
                         PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);

                if (m == (void *)-1)
                        printf("MMAP Failed: %d\n", m);
                else
                        printf("%d : MMAP returned %p\n", i, m);

                r = fork();

                if (r == 0) {
                        printf("%d: successed\n", i);
                        return 0;
                } else if (r < 0)
                        printf("FORK Failed: %d\n", r);
                else if (r > 0)
                        wait(NULL);
        }
        return 0;
}

Increase the storage size of the result to unsigned long, which is
sufficient for storing the difference between addresses.

Signed-off-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/fork.c b/kernel/fork.c
index 5b13eea..017fb23 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -386,7 +386,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		}
 		charge = 0;
 		if (mpnt->vm_flags & VM_ACCOUNT) {
-			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+			unsigned long len;
+			len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
 			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
 				goto fail_nomem;
 			charge = len;
-- 
cgit v0.10.2


From bde8bd8a1d5242589ddcaef8e017b48b207c4729 Mon Sep 17 00:00:00 2001
From: Sasikantha babu <sasikanth.v19@gmail.com>
Date: Tue, 29 May 2012 15:06:22 -0700
Subject: mm/vmstat.c: remove debug fs entries on failure of file creation and
 made extfrag_debug_root dentry local

Remove debug fs files and directory on failure.  Since no one is using
"extfrag_debug_root" dentry outside of extfrag_debug_init(), make it
local to the function.

Signed-off-by: Sasikantha babu <sasikanth.v19@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0dad31dc..1bbbbd9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1223,7 +1223,6 @@ module_init(setup_vmstat)
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
 #include <linux/debugfs.h>
 
-static struct dentry *extfrag_debug_root;
 
 /*
  * Return an index indicating how much of the available free memory is
@@ -1361,19 +1360,24 @@ static const struct file_operations extfrag_file_ops = {
 
 static int __init extfrag_debug_init(void)
 {
+	struct dentry *extfrag_debug_root;
+
 	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
 	if (!extfrag_debug_root)
 		return -ENOMEM;
 
 	if (!debugfs_create_file("unusable_index", 0444,
 			extfrag_debug_root, NULL, &unusable_file_ops))
-		return -ENOMEM;
+		goto fail;
 
 	if (!debugfs_create_file("extfrag_index", 0444,
 			extfrag_debug_root, NULL, &extfrag_file_ops))
-		return -ENOMEM;
+		goto fail;
 
 	return 0;
+fail:
+	debugfs_remove_recursive(extfrag_debug_root);
+	return -ENOMEM;
 }
 
 module_init(extfrag_debug_init);
-- 
cgit v0.10.2


From 1f1d06c34f7675026326cd9f39ff91e4555cf355 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 29 May 2012 15:06:23 -0700
Subject: thp, memcg: split hugepage for memcg oom on cow

On COW, a new hugepage is allocated and charged to the memcg.  If the
system is oom or the charge to the memcg fails, however, the fault
handler will return VM_FAULT_OOM which results in an oom kill.

Instead, it's possible to fallback to splitting the hugepage so that the
COW results only in an order-0 page being allocated and charged to the
memcg which has a higher liklihood to succeed.  This is expensive
because the hugepage must be split in the page fault handler, but it is
much better than unnecessarily oom killing a process.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d7d7165..edfeb8c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -952,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
 						   pmd, orig_pmd, page, haddr);
+		if (ret & VM_FAULT_OOM)
+			split_huge_page(page);
 		put_page(page);
 		goto out;
 	}
@@ -959,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
+		split_huge_page(page);
 		put_page(page);
 		ret |= VM_FAULT_OOM;
 		goto out;
diff --git a/mm/memory.c b/mm/memory.c
index 2bf9e11..1b7dc66 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3486,6 +3486,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
 
+retry:
 	pgd = pgd_offset(mm, address);
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
@@ -3499,13 +3500,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 							  pmd, flags);
 	} else {
 		pmd_t orig_pmd = *pmd;
+		int ret;
+
 		barrier();
 		if (pmd_trans_huge(orig_pmd)) {
 			if (flags & FAULT_FLAG_WRITE &&
 			    !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd))
-				return do_huge_pmd_wp_page(mm, vma, address,
-							   pmd, orig_pmd);
+			    !pmd_trans_splitting(orig_pmd)) {
+				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
+							  orig_pmd);
+				/*
+				 * If COW results in an oom, the huge pmd will
+				 * have been split, so retry the fault on the
+				 * pte for a smaller charge.
+				 */
+				if (unlikely(ret & VM_FAULT_OOM))
+					goto retry;
+				return ret;
+			}
 			return 0;
 		}
 	}
-- 
cgit v0.10.2


From 4a5b18cc1971046d9ca3a29fdcafbe5648629585 Mon Sep 17 00:00:00 2001
From: Larry Woodman <lwoodman@redhat.com>
Date: Tue, 29 May 2012 15:06:24 -0700
Subject: mm: do_migrate_pages() calls migrate_to_node() even if task is
 already on a correct node

While running an application that moves tasks from one cpuset to another
I noticed that it takes much longer and moves many more pages than
expected.

The reason for this is do_migrate_pages() does its best to preserve the
relative node differential from the first node of the cpuset because the
application may have been written with that in mind.  If memory was
interleaved on the nodes of the source cpuset by an application
do_migrate_pages() will try its best to maintain that interleaving on
the nodes of the destination cpuset.  This means copying the memory from
all source nodes to the destination nodes even if the source and
destination nodes overlap.

This is a problem for userspace NUMA placement tools.  The amount of
time spent doing extra memory moves cancels out some of the NUMA
performance improvements.  Furthermore, if the number of source and
destination nodes are to maintain the previous interleaving layout
anyway.

This patch changes do_migrate_pages() to only preserve the relative
layout inside the program if the number of NUMA nodes in the source and
destination mask are the same.  If the number is different, we do a much
more efficient migration by not touching memory that is in an allowed
node.

This preserves the old behaviour for programs that want it, while
allowing a userspace NUMA placement tool to use the new, faster
migration.  This improves performance in our tests by up to a factor of
7.

Without this change migrating tasks from a cpuset containing nodes 0-7
to a cpuset containing nodes 3-4, we migrate from ALL the nodes even if
they are in the both the source and destination nodesets:

   Migrating 7 to 4
   Migrating 6 to 3
   Migrating 5 to 4
   Migrating 4 to 3
   Migrating 1 to 4
   Migrating 3 to 4
   Migrating 0 to 3
   Migrating 2 to 3

With this change we only migrate from nodes that are not in the
destination nodesets:

   Migrating 7 to 4
   Migrating 6 to 3
   Migrating 5 to 4
   Migrating 2 to 3
   Migrating 1 to 4
   Migrating 0 to 3

Yet if we move from a cpuset containing nodes 2,3,4 to a cpuset
containing 3,4,5 we still do move everything so that we preserve the
desired NUMA offsets:

   Migrating 4 to 5
   Migrating 3 to 4
   Migrating 2 to 3

As far as performance is concerned this simple patch improves the time
it takes to move 14, 20 and 26 large tasks from a cpuset containing
nodes 0-7 to a cpuset containing nodes 1 & 3 by up to a factor of 7.
Here are the timings with and without the patch:

BEFORE PATCH -- Move times: 59, 140, 651 seconds
============

  Moving 14 tasks from nodes (0-7) to nodes (1,3)
  numad(8780) do_migrate_pages (mm=0xffff88081d414400
  from_nodes=0xffff880818c81d28 to_nodes=0xffff880818c81ce8 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x7 dest=0x3 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x6 dest=0x1 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x5 dest=0x3 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x4 dest=0x1 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x2 dest=0x1 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x1 dest=0x3 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x0 dest=0x1 flags=0x4)
  (Above moves repeated for each of the 14 tasks...)
  PID 8890 moved to node(s) 1,3 in 59.2 seconds

  Moving 20 tasks from nodes (0-7) to nodes (1,4-5)
  numad(8780) do_migrate_pages (mm=0xffff88081d88c700
  from_nodes=0xffff880818c81d28 to_nodes=0xffff880818c81ce8 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x7 dest=0x4 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x6 dest=0x1 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x3 dest=0x1 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x2 dest=0x5 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x1 dest=0x4 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x0 dest=0x1 flags=0x4)
  (Above moves repeated for each of the 20 tasks...)
  PID 8962 moved to node(s) 1,4-5 in 139.88 seconds

  Moving 26 tasks from nodes (0-7) to nodes (1-3,5)
  numad(8780) do_migrate_pages (mm=0xffff88081d5bc740
  from_nodes=0xffff880818c81d28 to_nodes=0xffff880818c81ce8 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x7 dest=0x5 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x6 dest=0x3 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x5 dest=0x2 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x3 dest=0x5 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x2 dest=0x3 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x1 dest=0x2 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x0 dest=0x1 flags=0x4)
  numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x4 dest=0x1 flags=0x4)
  (Above moves repeated for each of the 26 tasks...)
  PID 9058 moved to node(s) 1-3,5 in 651.45 seconds

AFTER PATCH -- Move times: 42, 56, 93 seconds
===========

  Moving 14 tasks from nodes (0-7) to nodes (5,7)
  numad(33209) do_migrate_pages (mm=0xffff88101d5ff140
  from_nodes=0xffff88101e7b5d28 to_nodes=0xffff88101e7b5ce8 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x6 dest=0x5 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x4 dest=0x5 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x3 dest=0x7 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x2 dest=0x5 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x1 dest=0x7 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x0 dest=0x5 flags=0x4)
  (Above moves repeated for each of the 14 tasks...)
  PID 33221 moved to node(s) 5,7 in 41.67 seconds

  Moving 20 tasks from nodes (0-7) to nodes (1,3,5)
  numad(33209) do_migrate_pages (mm=0xffff88101d6c37c0
  from_nodes=0xffff88101e7b5d28 to_nodes=0xffff88101e7b5ce8 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x7 dest=0x3 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x6 dest=0x1 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x4 dest=0x3 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x2 dest=0x5 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x0 dest=0x1 flags=0x4)
  (Above moves repeated for each of the 20 tasks...)
  PID 33289 moved to node(s) 1,3,5 in 56.3 seconds

  Moving 26 tasks from nodes (0-7) to nodes (1,3,5,7)
  numad(33209) do_migrate_pages (mm=0xffff88101d924400
  from_nodes=0xffff88101e7b5d28 to_nodes=0xffff88101e7b5ce8 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x6 dest=0x5 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x4 dest=0x1 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x2 dest=0x5 flags=0x4)
  numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x0 dest=0x1 flags=0x4)
  (Above moves repeated for each of the 26 tasks...)
  PID 33372 moved to node(s) 1,3,5,7 in 92.67 seconds

[akpm@linux-foundation.org: clean up comment layout]
Signed-off-by: Larry Woodman <lwoodman@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a388888..d3c5de4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1005,6 +1005,26 @@ int do_migrate_pages(struct mm_struct *mm,
 		int dest = 0;
 
 		for_each_node_mask(s, tmp) {
+
+			/*
+			 * do_migrate_pages() tries to maintain the relative
+			 * node relationship of the pages established between
+			 * threads and memory areas.
+                         *
+			 * However if the number of source nodes is not equal to
+			 * the number of destination nodes we can not preserve
+			 * this node relative relationship.  In that case, skip
+			 * copying memory from a node that is in the destination
+			 * mask.
+			 *
+			 * Example: [2,3,4] -> [3,4,5] moves everything.
+			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
+			 */
+
+			if ((nodes_weight(*from_nodes) != nodes_weight(*to_nodes)) &&
+						(node_isset(s, *to_nodes)))
+				continue;
+
 			d = node_remap(s, *from_nodes, *to_nodes);
 			if (s == d)
 				continue;
-- 
cgit v0.10.2


From 0ce72d4f7333248efbef1f3309770c7edb1b2625 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 29 May 2012 15:06:24 -0700
Subject: mm: do_migrate_pages(): rename arguments

s/from_nodes/from and s/to_nodes/to/.  The "_nodes" is redundant - it
duplicates the argument's type.

Done in a fit of irritation over 80-col issues :(

Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <mkosaki@redhat.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 7c727a9..4aa4273 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -225,8 +225,8 @@ static inline void check_highest_zone(enum zone_type k)
 		policy_zone = k;
 }
 
-int do_migrate_pages(struct mm_struct *mm,
-	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+		     const nodemask_t *to, int flags);
 
 
 #ifdef CONFIG_TMPFS
@@ -354,9 +354,8 @@ static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk,
 	return false;
 }
 
-static inline int do_migrate_pages(struct mm_struct *mm,
-			const nodemask_t *from_nodes,
-			const nodemask_t *to_nodes, int flags)
+static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+				   const nodemask_t *to, int flags)
 {
 	return 0;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d3c5de4..f15c1b2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -950,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
  *
  * Returns the number of page that could not be moved.
  */
-int do_migrate_pages(struct mm_struct *mm,
-	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+		     const nodemask_t *to, int flags)
 {
 	int busy = 0;
 	int err;
@@ -963,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm,
 
 	down_read(&mm->mmap_sem);
 
-	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+	err = migrate_vmas(mm, from, to, flags);
 	if (err)
 		goto out;
 
@@ -998,7 +998,7 @@ int do_migrate_pages(struct mm_struct *mm,
 	 * moved to an empty node, then there is nothing left worth migrating.
 	 */
 
-	tmp = *from_nodes;
+	tmp = *from;
 	while (!nodes_empty(tmp)) {
 		int s,d;
 		int source = -1;
@@ -1021,11 +1021,11 @@ int do_migrate_pages(struct mm_struct *mm,
 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
 			 */
 
-			if ((nodes_weight(*from_nodes) != nodes_weight(*to_nodes)) &&
-						(node_isset(s, *to_nodes)))
+			if ((nodes_weight(*from) != nodes_weight(*to)) &&
+						(node_isset(s, *to)))
 				continue;
 
-			d = node_remap(s, *from_nodes, *to_nodes);
+			d = node_remap(s, *from, *to);
 			if (s == d)
 				continue;
 
@@ -1085,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 {
 }
 
-int do_migrate_pages(struct mm_struct *mm,
-	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+		     const nodemask_t *to, int flags)
 {
 	return -ENOSYS;
 }
-- 
cgit v0.10.2


From 91c63734f6908425903aed69c04035592f18d398 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:24 -0700
Subject: kernel: cgroup: push rcu read locking from css_is_ancestor() to
 callsite

Library functions should not grab locks when the callsites can do it,
even if the lock nests like the rcu read-side lock does.

Push the rcu_read_lock() from css_is_ancestor() to its single user,
mem_cgroup_same_or_subtree() in preparation for another user that may
already hold the rcu read-side lock.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a0c6af3..0f3527d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5132,7 +5132,7 @@ EXPORT_SYMBOL_GPL(css_depth);
  * @root: the css supporsed to be an ancestor of the child.
  *
  * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
- * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
+ * this function reads css->id, the caller must hold rcu_read_lock().
  * But, considering usual usage, the csses should be valid objects after test.
  * Assuming that the caller will do some action to the child if this returns
  * returns true, the caller must take "child";s reference count.
@@ -5144,18 +5144,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
 {
 	struct css_id *child_id;
 	struct css_id *root_id;
-	bool ret = true;
 
-	rcu_read_lock();
 	child_id  = rcu_dereference(child->id);
+	if (!child_id)
+		return false;
 	root_id = rcu_dereference(root->id);
-	if (!child_id
-	    || !root_id
-	    || (child_id->depth < root_id->depth)
-	    || (child_id->stack[root_id->depth] != root_id->id))
-		ret = false;
-	rcu_read_unlock();
-	return ret;
+	if (!root_id)
+		return false;
+	if (child_id->depth < root_id->depth)
+		return false;
+	if (child_id->stack[root_id->depth] != root_id->id)
+		return false;
+	return true;
 }
 
 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 92675fe..faad98e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1152,12 +1152,16 @@ struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 		struct mem_cgroup *memcg)
 {
-	if (root_memcg != memcg) {
-		return (root_memcg->use_hierarchy &&
-			css_is_ancestor(&memcg->css, &root_memcg->css));
-	}
+	bool ret;
 
-	return true;
+	if (root_memcg == memcg)
+		return true;
+	if (!root_memcg->use_hierarchy)
+		return false;
+	rcu_read_lock();
+	ret = css_is_ancestor(&memcg->css, &root_memcg->css);
+	rcu_read_unlock();
+	return ret;
 }
 
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
-- 
cgit v0.10.2


From c3ac9a8ade65ccbfd145fbff895ae8d8d62d09b0 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:25 -0700
Subject: mm: memcg: count pte references from every member of the reclaimed
 hierarchy

The rmap walker checking page table references has historically ignored
references from VMAs that were not part of the memcg that was being
reclaimed during memcg hard limit reclaim.

When transitioning global reclaim to memcg hierarchy reclaim, I missed
that bit and now references from outside a memcg are ignored even during
global reclaim.

Reverting back to traditional behaviour - count all references during
global reclaim and only mind references of the memcg being reclaimed
during limit reclaim would be one option.

However, the more generic idea is to ignore references exactly then when
they are outside the hierarchy that is currently under reclaim; because
only then will their reclamation be of any use to help the pressure
situation.  It makes no sense to ignore references from a sibling memcg
and then evict a page that will be immediately refaulted by that sibling
which contributes to the same usage of the common ancestor under
reclaim.

The solution: make the rmap walker ignore references from VMAs that are
not part of the hierarchy that is being reclaimed.

Flat limit reclaim will stay the same, hierarchical limit reclaim will
mind the references only to pages that the hierarchy owns.  Global
reclaim, since it reclaims from all memcgs, will be fixed to regard all
references.

[akpm@linux-foundation.org: name the args in the declaration]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Acked-by: Konstantin Khlebnikov<khlebnikov@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f94efd2..18ea0b7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -79,6 +79,8 @@ extern void mem_cgroup_uncharge_cache_page(struct page *page);
 
 extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				     int order);
+bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+				  struct mem_cgroup *memcg);
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
@@ -92,10 +94,13 @@ static inline
 int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
 {
 	struct mem_cgroup *memcg;
+	int match;
+
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(rcu_dereference((mm)->owner));
+	match = __mem_cgroup_same_or_subtree(cgroup, memcg);
 	rcu_read_unlock();
-	return cgroup == memcg;
+	return match;
 }
 
 extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index faad98e..4f71219 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1149,17 +1149,23 @@ struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
  * Checks whether given mem is same or in the root_mem_cgroup's
  * hierarchy subtree
  */
-static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
-		struct mem_cgroup *memcg)
+bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+				  struct mem_cgroup *memcg)
 {
-	bool ret;
-
 	if (root_memcg == memcg)
 		return true;
 	if (!root_memcg->use_hierarchy)
 		return false;
+	return css_is_ancestor(&memcg->css, &root_memcg->css);
+}
+
+static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+				       struct mem_cgroup *memcg)
+{
+	bool ret;
+
 	rcu_read_lock();
-	ret = css_is_ancestor(&memcg->css, &root_memcg->css);
+	ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
 	rcu_read_unlock();
 	return ret;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 68e5819..8fffc65 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -645,7 +645,8 @@ static enum page_references page_check_references(struct page *page,
 	int referenced_ptes, referenced_page;
 	unsigned long vm_flags;
 
-	referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
+	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
+					  &vm_flags);
 	referenced_page = TestClearPageReferenced(page);
 
 	/*
@@ -1513,7 +1514,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 			}
 		}
 
-		if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
+		if (page_referenced(page, 0, sc->target_mem_cgroup,
+				    &vm_flags)) {
 			nr_rotated += hpage_nr_pages(page);
 			/*
 			 * Identify referenced, file-backed active pages and
-- 
cgit v0.10.2


From 096a7cf44712ab531101bb4689f75f7fcd9b9f18 Mon Sep 17 00:00:00 2001
From: Ying Han <yinghan@google.com>
Date: Tue, 29 May 2012 15:06:25 -0700
Subject: mm: rename is_mlocked_vma() to mlocked_vma_newpage()

Andrew pointed out that the is_mlocked_vma() is misnamed.  A function
with name like that would expect bool return and no side-effects.

Since it is called on the fault path for new page, rename it in this
patch.

Signed-off-by: Ying Han <yinghan@google.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujtisu.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
[akpm@linux-foundation.org: s/mlock_vma_newpage/mlock_vma_newpage/, per Minchan]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/internal.h b/mm/internal.h
index aee4761..8b0fc8d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -164,7 +164,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
  * to determine if it's being mapped into a LOCKED vma.
  * If so, mark page as mlocked.
  */
-static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
+static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
+				    struct page *page)
 {
 	VM_BUG_ON(PageLRU(page));
 
@@ -222,7 +223,7 @@ extern unsigned long vma_address(struct page *page,
 				 struct vm_area_struct *vma);
 #endif
 #else /* !CONFIG_MMU */
-static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
+static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
 {
 	return 0;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8fffc65..44f0436 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3321,7 +3321,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
 	if (mapping_unevictable(page_mapping(page)))
 		return 0;
 
-	if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
+	if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
 		return 0;
 
 	return 1;
-- 
cgit v0.10.2


From 6f60b69d8cabbf7c0daf879cae4d09ac0776f4b4 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 29 May 2012 15:06:26 -0700
Subject: mm, thp: drop page_table_lock to uncharge memcg pages

mm->page_table_lock is hotly contested for page fault tests and isn't
necessary to do mem_cgroup_uncharge_page() in do_huge_pmd_wp_page().

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index edfeb8c..d0def42 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -973,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(&mm->page_table_lock);
 	put_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
+		spin_unlock(&mm->page_table_lock);
 		mem_cgroup_uncharge_page(new_page);
 		put_page(new_page);
+		goto out;
 	} else {
 		pmd_t entry;
 		VM_BUG_ON(!PageHead(page));
-- 
cgit v0.10.2


From eb6332a54542bcd3aedb121ecb247f172b8f3602 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:26 -0700
Subject: Documentation: memcg: future proof hierarchical statistics
 documentation

The hierarchical versions of per-memcg counters in memory.stat are all
calculated the same way and are all named total_<counter>.

Documenting the pattern is easier for maintenance than listing each
counter twice.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Ying Han <yinghan@google.com>
Randy Dunlap <rdunlap@xenotime.net>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 9b1067a..e479007 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -430,17 +430,10 @@ hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy
 hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to
 			hierarchy under which memory cgroup is.
 
-total_cache		- sum of all children's "cache"
-total_rss		- sum of all children's "rss"
-total_mapped_file	- sum of all children's "cache"
-total_pgpgin		- sum of all children's "pgpgin"
-total_pgpgout		- sum of all children's "pgpgout"
-total_swap		- sum of all children's "swap"
-total_inactive_anon	- sum of all children's "inactive_anon"
-total_active_anon	- sum of all children's "active_anon"
-total_inactive_file	- sum of all children's "inactive_file"
-total_active_file	- sum of all children's "active_file"
-total_unevictable	- sum of all children's "unevictable"
+total_<counter>		- # hierarchical version of <counter>, which in
+			addition to the cgroup's own value includes the
+			sum of all hierarchical children's values of
+			<counter>, i.e. total_cache
 
 # The following additional stats are dependent on CONFIG_DEBUG_VM.
 
-- 
cgit v0.10.2


From 5febcbe99d4766cc383909c447e002e63d8b4592 Mon Sep 17 00:00:00 2001
From: Christopher Yeoh <cyeoh@au1.ibm.com>
Date: Tue, 29 May 2012 15:06:27 -0700
Subject: Cross Memory Attach: make it Kconfigurable

Add a Kconfig option to allow people who don't want cross memory attach to
not have it included in their build.

Signed-off-by: Chris Yeoh <yeohc@au1.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/Kconfig b/mm/Kconfig
index 3922002..b217637 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -349,6 +349,16 @@ choice
 	  benefit.
 endchoice
 
+config CROSS_MEMORY_ATTACH
+	bool "Cross Memory Support"
+	depends on MMU
+	default y
+	help
+	  Enabling this option adds the system calls process_vm_readv and
+	  process_vm_writev which allow a process with the correct privileges
+	  to directly read from or write to to another process's address space.
+	  See the man page for more details.
+
 #
 # UP and nommu archs use km based percpu allocator
 #
diff --git a/mm/Makefile b/mm/Makefile
index ccecbf9..a156285 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,8 +5,11 @@
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
-			   vmalloc.o pagewalk.o pgtable-generic.o \
-			   process_vm_access.o
+			   vmalloc.o pagewalk.o pgtable-generic.o
+
+ifdef CONFIG_CROSS_MEMORY_ATTACH
+mmu-$(CONFIG_MMU)	+= process_vm_access.o
+endif
 
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
-- 
cgit v0.10.2


From baf05aa9271bdbc07d3160035a231abc5fbd429a Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue, 29 May 2012 15:06:27 -0700
Subject: bug: introduce BUILD_BUG_ON_INVALID() macro

Sometimes we want to check some expressions correctness at compile time.
"(void)(e);" or "if (e);" can be dangerous if the expression has
side-effects, and gcc sometimes generates a lot of code, even if the
expression has no effect.

This patch introduces macro BUILD_BUG_ON_INVALID() for such checks, it
forces a compilation error if expression is invalid without any extra
code.

[Cast to "long" required because sizeof does not work for bit-fields.]

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/bug.h b/include/linux/bug.h
index 72961c3..aaac4bb 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -30,6 +30,13 @@ struct pt_regs;
 #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
 #define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); }))
 
+/*
+ * BUILD_BUG_ON_INVALID() permits the compiler to check the validity of the
+ * expression but avoids the generation of any code, even if that expression
+ * has side-effects.
+ */
+#define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
+
 /**
  * BUILD_BUG_ON - break compile if a condition is true.
  * @condition: the condition which the compiler should know is false.
-- 
cgit v0.10.2


From 02602a18c32d76f0e0f50eefa91b2d53c8a3a751 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue, 29 May 2012 15:06:28 -0700
Subject: bug: completely remove code generated by disabled VM_BUG_ON()

Even if CONFIG_DEBUG_VM=n gcc genereates code for some VM_BUG_ON()

for example VM_BUG_ON(!PageCompound(page) || !PageHead(page)); in
do_huge_pmd_wp_page() generates 114 bytes of code.

But they mostly disappears when I split this VM_BUG_ON into two:

  -VM_BUG_ON(!PageCompound(page) || !PageHead(page));
  +VM_BUG_ON(!PageCompound(page));
  +VM_BUG_ON(!PageHead(page));

weird... but anyway after this patch code disappears completely.

  add/remove: 0/0 grow/shrink: 7/97 up/down: 135/-1784 (-1649)

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index c04ecfe..580bd58 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -4,7 +4,7 @@
 #ifdef CONFIG_DEBUG_VM
 #define VM_BUG_ON(cond) BUG_ON(cond)
 #else
-#define VM_BUG_ON(cond) do { (void)(cond); } while (0)
+#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #endif
 
 #ifdef CONFIG_DEBUG_VIRTUAL
-- 
cgit v0.10.2


From 91eb0f67c38c7104766faa49c5aaee2b4876511e Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 29 May 2012 15:06:28 -0700
Subject: x86: print e820 physical addresses consistently with other parts of
 kernel

Print physical address info in a style consistent with the %pR style used
elsewhere in the kernel.  For example:

    -BIOS-provided physical RAM map:
    +e820: BIOS-provided physical RAM map:
    - BIOS-e820: 0000000000000100 - 000000000009e000 (usable)
    +BIOS-e820: [mem 0x0000000000000100-0x000000000009dfff] usable
    -Allocating PCI resources starting at 90000000 (gap: 90000000:6ed1c000)
    +e820: [mem 0x90000000-0xfed1bfff] available for PCI devices
    -reserve RAM buffer: 000000000009e000 - 000000000009ffff
    +e820: reserve RAM buffer [mem 0x0009e000-0x0009ffff]

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 62d61e9..4185797 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -113,7 +113,9 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
 	int x = e820x->nr_map;
 
 	if (x >= ARRAY_SIZE(e820x->map)) {
-		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+		printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
+		       (unsigned long long) start,
+		       (unsigned long long) (start + size - 1));
 		return;
 	}
 
@@ -133,19 +135,19 @@ static void __init e820_print_type(u32 type)
 	switch (type) {
 	case E820_RAM:
 	case E820_RESERVED_KERN:
-		printk(KERN_CONT "(usable)");
+		printk(KERN_CONT "usable");
 		break;
 	case E820_RESERVED:
-		printk(KERN_CONT "(reserved)");
+		printk(KERN_CONT "reserved");
 		break;
 	case E820_ACPI:
-		printk(KERN_CONT "(ACPI data)");
+		printk(KERN_CONT "ACPI data");
 		break;
 	case E820_NVS:
-		printk(KERN_CONT "(ACPI NVS)");
+		printk(KERN_CONT "ACPI NVS");
 		break;
 	case E820_UNUSABLE:
-		printk(KERN_CONT "(unusable)");
+		printk(KERN_CONT "unusable");
 		break;
 	default:
 		printk(KERN_CONT "type %u", type);
@@ -158,10 +160,10 @@ void __init e820_print_map(char *who)
 	int i;
 
 	for (i = 0; i < e820.nr_map; i++) {
-		printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+		printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
 		       (unsigned long long) e820.map[i].addr,
 		       (unsigned long long)
-		       (e820.map[i].addr + e820.map[i].size));
+		       (e820.map[i].addr + e820.map[i].size - 1));
 		e820_print_type(e820.map[i].type);
 		printk(KERN_CONT "\n");
 	}
@@ -428,9 +430,8 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
 		size = ULLONG_MAX - start;
 
 	end = start + size;
-	printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
-		       (unsigned long long) start,
-		       (unsigned long long) end);
+	printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ",
+	       (unsigned long long) start, (unsigned long long) (end - 1));
 	e820_print_type(old_type);
 	printk(KERN_CONT " ==> ");
 	e820_print_type(new_type);
@@ -509,9 +510,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
 		size = ULLONG_MAX - start;
 
 	end = start + size;
-	printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
-		       (unsigned long long) start,
-		       (unsigned long long) end);
+	printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ",
+	       (unsigned long long) start, (unsigned long long) (end - 1));
 	if (checktype)
 		e820_print_type(old_type);
 	printk(KERN_CONT "\n");
@@ -567,7 +567,7 @@ void __init update_e820(void)
 	if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
 		return;
 	e820.nr_map = nr_map;
-	printk(KERN_INFO "modified physical RAM map:\n");
+	printk(KERN_INFO "e820: modified physical RAM map:\n");
 	e820_print_map("modified");
 }
 static void __init update_e820_saved(void)
@@ -637,8 +637,8 @@ __init void e820_setup_gap(void)
 	if (!found) {
 		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
 		printk(KERN_ERR
-	"PCI: Warning: Cannot find a gap in the 32bit address range\n"
-	"PCI: Unassigned devices with 32bit resource registers may break!\n");
+	"e820: cannot find a gap in the 32bit address range\n"
+	"e820: PCI devices with unassigned 32bit BARs may break!\n");
 	}
 #endif
 
@@ -648,8 +648,8 @@ __init void e820_setup_gap(void)
 	pci_mem_start = gapstart;
 
 	printk(KERN_INFO
-	       "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
-	       pci_mem_start, gapstart, gapsize);
+	       "e820: [mem %#010lx-%#010lx] available for PCI devices\n",
+	       gapstart, gapstart + gapsize - 1);
 }
 
 /**
@@ -667,7 +667,7 @@ void __init parse_e820_ext(struct setup_data *sdata)
 	extmap = (struct e820entry *)(sdata->data);
 	__append_e820_map(extmap, entries);
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-	printk(KERN_INFO "extended physical RAM map:\n");
+	printk(KERN_INFO "e820: extended physical RAM map:\n");
 	e820_print_map("extended");
 }
 
@@ -734,7 +734,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
 	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 	if (addr) {
 		e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
-		printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
+		printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n");
 		update_e820_saved();
 	}
 
@@ -784,7 +784,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 	if (last_pfn > max_arch_pfn)
 		last_pfn = max_arch_pfn;
 
-	printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
+	printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
 			 last_pfn, max_arch_pfn);
 	return last_pfn;
 }
@@ -888,7 +888,7 @@ void __init finish_e820_parsing(void)
 			early_panic("Invalid user supplied memory map");
 		e820.nr_map = nr;
 
-		printk(KERN_INFO "user-defined physical RAM map:\n");
+		printk(KERN_INFO "e820: user-defined physical RAM map:\n");
 		e820_print_map("user");
 	}
 }
@@ -996,8 +996,9 @@ void __init e820_reserve_resources_late(void)
 			end = MAX_RESOURCE_SIZE;
 		if (start >= end)
 			continue;
-		printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
-			       start, end);
+		printk(KERN_DEBUG
+		       "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n",
+		       start, end);
 		reserve_region_with_split(&iomem_resource, start, end,
 					  "RAM buffer");
 	}
@@ -1047,7 +1048,7 @@ void __init setup_memory_map(void)
 
 	who = x86_init.resources.memory_setup();
 	memcpy(&e820_saved, &e820, sizeof(struct e820map));
-	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+	printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
 	e820_print_map(who);
 }
 
-- 
cgit v0.10.2


From 365811d6f9bd98543bedc02b72d94f0f0faf3670 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 29 May 2012 15:06:29 -0700
Subject: x86: print physical addresses consistently with other parts of kernel

Print physical address info in a style consistent with the %pR style used
elsewhere in the kernel.  For example:

    -found SMP MP-table at [ffff8800000fce90] fce90
    +found SMP MP-table at [mem 0x000fce90-0x000fce9f] mapped at [ffff8800000fce90]
    -initial memory mapped : 0 - 20000000
    +initial memory mapped: [mem 0x00000000-0x1fffffff]
    -Base memory trampoline at [ffff88000009c000] 9c000 size 8192
    +Base memory trampoline [mem 0x0009c000-0x0009dfff] mapped at [ffff88000009c000]
    -SRAT: Node 0 PXM 0 0-80000000
    +SRAT: Node 0 PXM 0 [mem 0x00000000-0x7fffffff]

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b02d4dd..fbca2e6 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -568,8 +568,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
 	struct mpf_intel *mpf;
 	unsigned long mem;
 
-	apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
-			bp, length);
+	apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
+		    base, base + length - 1);
 	BUILD_BUG_ON(sizeof(*mpf) != 16);
 
 	while (length > 0) {
@@ -584,8 +584,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
 #endif
 			mpf_found = mpf;
 
-			printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
-			       mpf, (u64)virt_to_phys(mpf));
+			printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
+			       (unsigned long long) virt_to_phys(mpf),
+			       (unsigned long long) virt_to_phys(mpf) +
+			       sizeof(*mpf) - 1, mpf);
 
 			mem = virt_to_phys(mpf);
 			memblock_reserve(mem, sizeof(*mpf));
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f2afee6..982e44f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -334,8 +334,8 @@ static void __init relocate_initrd(void)
 	memblock_reserve(ramdisk_here, area_size);
 	initrd_start = ramdisk_here + PAGE_OFFSET;
 	initrd_end   = initrd_start + ramdisk_size;
-	printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
-			 ramdisk_here, ramdisk_here + ramdisk_size);
+	printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
+			 ramdisk_here, ramdisk_here + ramdisk_size - 1);
 
 	q = (char *)initrd_start;
 
@@ -366,8 +366,8 @@ static void __init relocate_initrd(void)
 	/* high pages is not converted by early_res_to_bootmem */
 	ramdisk_image = boot_params.hdr.ramdisk_image;
 	ramdisk_size  = boot_params.hdr.ramdisk_size;
-	printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
-		" %08llx - %08llx\n",
+	printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
+		" [mem %#010llx-%#010llx]\n",
 		ramdisk_image, ramdisk_image + ramdisk_size - 1,
 		ramdisk_here, ramdisk_here + ramdisk_size - 1);
 }
@@ -392,8 +392,8 @@ static void __init reserve_initrd(void)
 		       ramdisk_size, end_of_lowmem>>1);
 	}
 
-	printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
-			ramdisk_end);
+	printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
+			ramdisk_end - 1);
 
 
 	if (ramdisk_end <= end_of_lowmem) {
@@ -906,8 +906,8 @@ void __init setup_arch(char **cmdline_p)
 	setup_bios_corruption_check();
 #endif
 
-	printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
-			max_pfn_mapped<<PAGE_SHIFT);
+	printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
+			(max_pfn_mapped<<PAGE_SHIFT) - 1);
 
 	setup_trampolines();
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 319b6f2..97141c2 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -84,8 +84,9 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en
 	pgt_buf_end = pgt_buf_start;
 	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
 
-	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
+	printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n",
+		end - 1, pgt_buf_start << PAGE_SHIFT,
+		(pgt_buf_top << PAGE_SHIFT) - 1);
 }
 
 void __init native_pagetable_reserve(u64 start, u64 end)
@@ -132,7 +133,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	int nr_range, i;
 	int use_pse, use_gbpages;
 
-	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+	printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
+	       start, end - 1);
 
 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
 	/*
@@ -251,8 +253,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	}
 
 	for (i = 0; i < nr_range; i++)
-		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
-				mr[i].start, mr[i].end,
+		printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
+				mr[i].start, mr[i].end - 1,
 			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
 			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
 
@@ -350,8 +352,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	 * create a kernel page fault:
 	 */
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
-		begin, end);
+	printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
+		begin, end - 1);
 	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
 #else
 	/*
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 19d3fa0..2d125be 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -141,8 +141,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
 
 	/* whine about and ignore invalid blks */
 	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
-		pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
-			   nid, start, end);
+		pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+			   nid, start, end - 1);
 		return 0;
 	}
 
@@ -210,8 +210,8 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 
 	start = roundup(start, ZONE_ALIGN);
 
-	printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
-	       nid, start, end);
+	printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
+	       nid, start, end - 1);
 
 	/*
 	 * Allocate node data.  Try remap allocator first, node-local
@@ -232,7 +232,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 	}
 
 	/* report and initialize */
-	printk(KERN_INFO "  NODE_DATA [%016Lx - %016Lx]%s\n",
+	printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]%s\n",
 	       nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
 	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
 	if (!remapped && tnid != nid)
@@ -291,14 +291,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 			 */
 			if (bi->end > bj->start && bi->start < bj->end) {
 				if (bi->nid != bj->nid) {
-					pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
-					       bi->nid, bi->start, bi->end,
-					       bj->nid, bj->start, bj->end);
+					pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+					       bi->nid, bi->start, bi->end - 1,
+					       bj->nid, bj->start, bj->end - 1);
 					return -EINVAL;
 				}
-				pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
-					   bi->nid, bi->start, bi->end,
-					   bj->start, bj->end);
+				pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+					   bi->nid, bi->start, bi->end - 1,
+					   bj->start, bj->end - 1);
 			}
 
 			/*
@@ -320,9 +320,9 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 			}
 			if (k < mi->nr_blks)
 				continue;
-			printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
-			       bi->nid, bi->start, bi->end, bj->start, bj->end,
-			       start, end);
+			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
+			       bi->nid, bi->start, bi->end - 1, bj->start,
+			       bj->end - 1, start, end - 1);
 			bi->start = start;
 			bi->end = end;
 			numa_remove_memblk_from(j--, mi);
@@ -616,8 +616,8 @@ static int __init dummy_numa_init(void)
 {
 	printk(KERN_INFO "%s\n",
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
-	printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
-	       0LLU, PFN_PHYS(max_pfn));
+	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
+	       0LLU, PFN_PHYS(max_pfn) - 1);
 
 	node_set(0, numa_nodes_parsed);
 	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 871dd88..dbbbb47 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -68,8 +68,8 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
 		numa_remove_memblk_from(phys_blk, pi);
 	}
 
-	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
-	       eb->start, eb->end, (eb->end - eb->start) >> 20);
+	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
+	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
 	return 0;
 }
 
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index f6ff57b..f11729f 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -209,9 +209,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
 		page = pfn_to_page(pfn);
 		type = get_page_memtype(page);
 		if (type != -1) {
-			printk(KERN_INFO "reserve_ram_pages_type failed "
-				"0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
-				start, end, type, req_type);
+			printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n",
+				start, end - 1, type, req_type);
 			if (new_type)
 				*new_type = type;
 
@@ -314,9 +313,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 	err = rbt_memtype_check_insert(new, new_type);
 	if (err) {
-		printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
-		       "track %s, req %s\n",
-		       start, end, cattr_name(new->type), cattr_name(req_type));
+		printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+		       start, end - 1,
+		       cattr_name(new->type), cattr_name(req_type));
 		kfree(new);
 		spin_unlock(&memtype_lock);
 
@@ -325,8 +324,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 	spin_unlock(&memtype_lock);
 
-	dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
-		start, end, cattr_name(new->type), cattr_name(req_type),
+	dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
+		start, end - 1, cattr_name(new->type), cattr_name(req_type),
 		new_type ? cattr_name(*new_type) : "-");
 
 	return err;
@@ -360,14 +359,14 @@ int free_memtype(u64 start, u64 end)
 	spin_unlock(&memtype_lock);
 
 	if (!entry) {
-		printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
-			current->comm, current->pid, start, end);
+		printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+		       current->comm, current->pid, start, end - 1);
 		return -EINVAL;
 	}
 
 	kfree(entry);
 
-	dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+	dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
 
 	return 0;
 }
@@ -491,9 +490,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 
 	while (cursor < to) {
 		if (!devmem_is_allowed(pfn)) {
-			printk(KERN_INFO
-		"Program %s tried to access /dev/mem between %Lx->%Lx.\n",
-				current->comm, from, to);
+			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n",
+				current->comm, from, to - 1);
 			return 0;
 		}
 		cursor += PAGE_SIZE;
@@ -554,12 +552,11 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
 				size;
 
 	if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
-		printk(KERN_INFO
-			"%s:%d ioremap_change_attr failed %s "
-			"for %Lx-%Lx\n",
+		printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
+			"for [mem %#010Lx-%#010Lx]\n",
 			current->comm, current->pid,
 			cattr_name(flags),
-			base, (unsigned long long)(base + size));
+			base, (unsigned long long)(base + size-1));
 		return -EINVAL;
 	}
 	return 0;
@@ -591,12 +588,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 
 		flags = lookup_memtype(paddr);
 		if (want_flags != flags) {
-			printk(KERN_WARNING
-			"%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
+			printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
 				current->comm, current->pid,
 				cattr_name(want_flags),
 				(unsigned long long)paddr,
-				(unsigned long long)(paddr + size),
+				(unsigned long long)(paddr + size - 1),
 				cattr_name(flags));
 			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
 					      (~_PAGE_CACHE_MASK)) |
@@ -614,11 +610,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 		    !is_new_memtype_allowed(paddr, size, want_flags, flags)) {
 			free_memtype(paddr, paddr + size);
 			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
-				" for %Lx-%Lx, got %s\n",
+				" for [mem %#010Lx-%#010Lx], got %s\n",
 				current->comm, current->pid,
 				cattr_name(want_flags),
 				(unsigned long long)paddr,
-				(unsigned long long)(paddr + size),
+				(unsigned long long)(paddr + size - 1),
 				cattr_name(flags));
 			return -EINVAL;
 		}
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index efb5b4b..732af3a 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -176,8 +176,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 		return;
 	}
 
-	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
-	       start, end);
+	printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
+	       node, pxm,
+	       (unsigned long long) start, (unsigned long long) end - 1);
 }
 
 void __init acpi_numa_arch_fixup(void) {}
-- 
cgit v0.10.2


From 3af684c7c5b3dddf7c5d83b8ad431380cdc6f164 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 29 May 2012 15:06:29 -0700
Subject: swiotlb: print physical addresses consistently with other parts of
 kernel

Print swiotlb info in a style consistent with the %pR style used elsewhere
in the kernel.  For example:

    -Placing 64MB software IO TLB between ffff88007a662000 - ffff88007e662000
    -software IO TLB at phys 0x7a662000 - 0x7e662000
    +software IO TLB [mem 0x7a662000-0x7e661fff] (64MB) mapped at [ffff88007a662000-ffff88007e661fff]

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 414f46e..45bc1f8 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -130,11 +130,9 @@ void swiotlb_print_info(void)
 	pstart = virt_to_phys(io_tlb_start);
 	pend = virt_to_phys(io_tlb_end);
 
-	printk(KERN_INFO "Placing %luMB software IO TLB between %p - %p\n",
-	       bytes >> 20, io_tlb_start, io_tlb_end);
-	printk(KERN_INFO "software IO TLB at phys %#llx - %#llx\n",
-	       (unsigned long long)pstart,
-	       (unsigned long long)pend);
+	printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n",
+	       (unsigned long long)pstart, (unsigned long long)pend - 1,
+	       bytes >> 20, io_tlb_start, io_tlb_end - 1);
 }
 
 void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
-- 
cgit v0.10.2


From a62e2f4f508863da8e0c2f2b42f5252a87330297 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 29 May 2012 15:06:30 -0700
Subject: mm: print physical addresses consistently with other parts of kernel

Print physical address info in a style consistent with the %pR style used
elsewhere in the kernel.  For example:

    -Zone PFN ranges:
    +Zone ranges:
    -  DMA32    0x00000010 -> 0x00100000
    +  DMA32    [mem 0x00010000-0xffffffff]
    -  Normal   0x00100000 -> 0x01080000
    +  Normal   [mem 0x100000000-0x107fffffff]

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fc898cb..0d7e3ec 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
 	res->end = start + size - 1;
 	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 	if (request_resource(&iomem_resource, res) < 0) {
-		printk("System RAM resource %llx - %llx cannot be added\n",
-		(unsigned long long)res->start, (unsigned long long)res->end);
+		printk("System RAM resource %pR cannot be added\n", res);
 		kfree(res);
 		res = NULL;
 	}
@@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
 		online_pages_range);
 	if (ret) {
 		mutex_unlock(&zonelists_mutex);
-		printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
-			nr_pages, pfn);
+		printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
+		       (unsigned long long) pfn << PAGE_SHIFT,
+		       (((unsigned long long) pfn + nr_pages)
+			    << PAGE_SHIFT) - 1);
 		memory_notify(MEM_CANCEL_ONLINE, &arg);
 		unlock_memory_hotplug();
 		return ret;
@@ -977,8 +978,9 @@ repeat:
 	return 0;
 
 failed_removal:
-	printk(KERN_INFO "memory offlining %lx to %lx failed\n",
-		start_pfn, end_pfn);
+	printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
+	       (unsigned long long) start_pfn << PAGE_SHIFT,
+	       ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
 	memory_notify(MEM_CANCEL_OFFLINE, &arg);
 	/* pushback to free area */
 	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bab8e3bc..8d71ad8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4815,7 +4815,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	find_zone_movable_pfns_for_nodes();
 
 	/* Print out the zone ranges */
-	printk("Zone PFN ranges:\n");
+	printk("Zone ranges:\n");
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		if (i == ZONE_MOVABLE)
 			continue;
@@ -4824,22 +4824,25 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 				arch_zone_highest_possible_pfn[i])
 			printk(KERN_CONT "empty\n");
 		else
-			printk(KERN_CONT "%0#10lx -> %0#10lx\n",
-				arch_zone_lowest_possible_pfn[i],
-				arch_zone_highest_possible_pfn[i]);
+			printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
+				arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
+				(arch_zone_highest_possible_pfn[i]
+					<< PAGE_SHIFT) - 1);
 	}
 
 	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
-	printk("Movable zone start PFN for each node\n");
+	printk("Movable zone start for each node\n");
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		if (zone_movable_pfn[i])
-			printk("  Node %d: %lu\n", i, zone_movable_pfn[i]);
+			printk("  Node %d: %#010lx\n", i,
+			       zone_movable_pfn[i] << PAGE_SHIFT);
 	}
 
 	/* Print out the early_node_map[] */
-	printk("Early memory PFN ranges\n");
+	printk("Early memory node ranges\n");
 	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
-		printk("  %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn);
+		printk("  node %3d: [mem %#010lx-%#010lx]\n", nid,
+		       start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
 
 	/* Initialise every node */
 	mminit_verify_pageflags_layout();
-- 
cgit v0.10.2


From 9295b7a07c859a42346221b5839be0ae612333b0 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@akkadia.org>
Date: Tue, 29 May 2012 15:06:30 -0700
Subject: kbuild: install kernel-page-flags.h

Programs using /proc/kpageflags need to know about the various flags.  The
<linux/kernel-page-flags.h> provides them and the comments in the file
indicate that it is supposed to be used by user-level code.  But the file
is not installed.

Install the headers and mark the unstable flags as out-of-bounds.  The
page-type tool is also adjusted to not duplicate the definitions

Signed-off-by: Ulrich Drepper <drepper@gmail.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 4cd59b9..7185b8f 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -225,6 +225,7 @@ header-y += kd.h
 header-y += kdev_t.h
 header-y += kernel.h
 header-y += kernelcapi.h
+header-y += kernel-page-flags.h
 header-y += keyboard.h
 header-y += keyctl.h
 header-y += l2tp.h
diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h
index 26a6571..a1bdf69 100644
--- a/include/linux/kernel-page-flags.h
+++ b/include/linux/kernel-page-flags.h
@@ -32,6 +32,8 @@
 #define KPF_KSM			21
 #define KPF_THP			22
 
+#ifdef __KERNEL__
+
 /* kernel hacking assistances
  * WARNING: subject to change, never rely on them!
  */
@@ -44,4 +46,6 @@
 #define KPF_ARCH		38
 #define KPF_UNCACHED		39
 
+#endif /* __KERNEL__ */
+
 #endif /* LINUX_KERNEL_PAGE_FLAGS_H */
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 7dab7b25..f77c96b 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -35,6 +35,7 @@
 #include <sys/mount.h>
 #include <sys/statfs.h>
 #include "../../include/linux/magic.h"
+#include "../../include/linux/kernel-page-flags.h"
 
 
 #ifndef MAX_PATH
@@ -73,33 +74,6 @@
 #define KPF_BYTES		8
 #define PROC_KPAGEFLAGS		"/proc/kpageflags"
 
-/* copied from kpageflags_read() */
-#define KPF_LOCKED		0
-#define KPF_ERROR		1
-#define KPF_REFERENCED		2
-#define KPF_UPTODATE		3
-#define KPF_DIRTY		4
-#define KPF_LRU			5
-#define KPF_ACTIVE		6
-#define KPF_SLAB		7
-#define KPF_WRITEBACK		8
-#define KPF_RECLAIM		9
-#define KPF_BUDDY		10
-
-/* [11-20] new additions in 2.6.31 */
-#define KPF_MMAP		11
-#define KPF_ANON		12
-#define KPF_SWAPCACHE		13
-#define KPF_SWAPBACKED		14
-#define KPF_COMPOUND_HEAD	15
-#define KPF_COMPOUND_TAIL	16
-#define KPF_HUGE		17
-#define KPF_UNEVICTABLE		18
-#define KPF_HWPOISON		19
-#define KPF_NOPAGE		20
-#define KPF_KSM			21
-#define KPF_THP			22
-
 /* [32-] kernel hacking assistances */
 #define KPF_RESERVED		32
 #define KPF_MLOCKED		33
-- 
cgit v0.10.2


From e30d539b3fe76f5bb71e069dcbfa47a1c2e6da3b Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@akkadia.org>
Date: Tue, 29 May 2012 15:06:30 -0700
Subject: tools/vm/page-types.c: cleanups

Compiling page-type.c with a recent compiler produces many warnings,
mostly related to signed/unsigned comparisons.  This patch cleans up most
of them.

One remaining warning is about an unused parameter.  The <compiler.h> file
doesn't define a __unused macro (or the like) yet.  This can be addressed
later.

Signed-off-by: Ulrich Drepper <drepper@gmail.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index f77c96b..f576971 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -300,7 +300,7 @@ static char *page_flag_name(uint64_t flags)
 {
 	static char buf[65];
 	int present;
-	int i, j;
+	size_t i, j;
 
 	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
 		present = (flags >> i) & 1;
@@ -318,7 +318,7 @@ static char *page_flag_name(uint64_t flags)
 static char *page_flag_longname(uint64_t flags)
 {
 	static char buf[1024];
-	int i, n;
+	size_t i, n;
 
 	for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
 		if (!page_flag_names[i])
@@ -376,7 +376,7 @@ static void show_page(unsigned long voffset,
 
 static void show_summary(void)
 {
-	int i;
+	size_t i;
 
 	printf("             flags\tpage-count       MB"
 		"  symbolic-flags\t\t\tlong-symbolic-flags\n");
@@ -474,7 +474,7 @@ static int debugfs_valid_mountpoint(const char *debugfs)
 /* find the path to the mounted debugfs */
 static const char *debugfs_find_mountpoint(void)
 {
-	const char **ptr;
+	const char *const *ptr;
 	char type[100];
 	FILE *fp;
 
@@ -511,7 +511,7 @@ static const char *debugfs_find_mountpoint(void)
 
 static void debugfs_mount(void)
 {
-	const char **ptr;
+	const char *const *ptr;
 
 	/* see if it's already mounted */
 	if (debugfs_find_mountpoint())
@@ -588,10 +588,10 @@ static int unpoison_page(unsigned long offset)
  * page frame walker
  */
 
-static int hash_slot(uint64_t flags)
+static size_t hash_slot(uint64_t flags)
 {
-	int k = HASH_KEY(flags);
-	int i;
+	size_t k = HASH_KEY(flags);
+	size_t i;
 
 	/* Explicitly reserve slot 0 for flags 0: the following logic
 	 * cannot distinguish an unoccupied slot from slot (flags==0).
@@ -644,7 +644,7 @@ static void walk_pfn(unsigned long voffset,
 {
 	uint64_t buf[KPAGEFLAGS_BATCH];
 	unsigned long batch;
-	long pages;
+	unsigned long pages;
 	unsigned long i;
 
 	while (count) {
@@ -753,7 +753,7 @@ static const char *page_flag_type(uint64_t flag)
 
 static void usage(void)
 {
-	int i, j;
+	size_t i, j;
 
 	printf(
 "page-types [options]\n"
@@ -912,7 +912,7 @@ static void add_bits_filter(uint64_t mask, uint64_t bits)
 
 static uint64_t parse_flag_name(const char *str, int len)
 {
-	int i;
+	size_t i;
 
 	if (!*str || !len)
 		return 0;
-- 
cgit v0.10.2


From 2099597401c7710c00b0d7c32b24a44a193836e1 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Tue, 29 May 2012 15:06:31 -0700
Subject: mm: move is_vma_temporary_stack() declaration to huge_mm.h

When transparent_hugepage_enabled() is used outside mm/, such as in
arch/x86/xx/tlb.c:

+       if (!cpu_has_invlpg || vma->vm_flags & VM_HUGETLB
+                       || transparent_hugepage_enabled(vma)) {
+               flush_tlb_mm(vma->vm_mm);

is_vma_temporary_stack() isn't referenced in huge_mm.h, so it has compile
errors:

  arch/x86/mm/tlb.c: In function `flush_tlb_range':
  arch/x86/mm/tlb.c:324:4: error: implicit declaration of function `is_vma_temporary_stack' [-Werror=implicit-function-declaration]

Since is_vma_temporay_stack() is just used in rmap.c and huge_memory.c, it
is better to move it to huge_mm.h from rmap.h to avoid such errors.

Signed-off-by: Alex Shi <alex.shi@intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c8af7a2..4c59b11 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -59,6 +59,8 @@ extern pmd_t *page_check_address_pmd(struct page *page,
 #define HPAGE_PMD_MASK HPAGE_MASK
 #define HPAGE_PMD_SIZE HPAGE_SIZE
 
+extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
+
 #define transparent_hugepage_enabled(__vma)				\
 	((transparent_hugepage_flags &					\
 	  (1<<TRANSPARENT_HUGEPAGE_FLAG) ||				\
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fd07c45..3fce545 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -173,8 +173,6 @@ enum ttu_flags {
 };
 #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
 
-bool is_vma_temporary_stack(struct vm_area_struct *vma);
-
 int try_to_unmap(struct page *, enum ttu_flags flags);
 int try_to_unmap_one(struct page *, struct vm_area_struct *,
 			unsigned long address, enum ttu_flags flags);
-- 
cgit v0.10.2


From 955c1cd7401565671b064e499115344ec8067dfd Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 29 May 2012 15:06:31 -0700
Subject: mm/page_alloc.c: remove pageblock_default_order()

This has always been broken: one version takes an unsigned int and the
other version takes no arguments.  This bug was hidden because one
version of set_pageblock_order() was a macro which doesn't evaluate its
argument.

Simplify it all and remove pageblock_default_order() altogether.

Reported-by: rajman mekaco <rajman.mekaco@gmail.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8d71ad8..84f2c59 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4300,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat,
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
-/* Return a sensible default order for the pageblock size. */
-static inline int pageblock_default_order(void)
-{
-	if (HPAGE_SHIFT > PAGE_SHIFT)
-		return HUGETLB_PAGE_ORDER;
-
-	return MAX_ORDER-1;
-}
-
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(unsigned int order)
+static inline void __init set_pageblock_order(void)
 {
+	unsigned int order;
+
 	/* Check that pageblock_nr_pages has not already been setup */
 	if (pageblock_order)
 		return;
 
+	if (HPAGE_SHIFT > PAGE_SHIFT)
+		order = HUGETLB_PAGE_ORDER;
+	else
+		order = MAX_ORDER - 1;
+
 	/*
 	 * Assume the largest contiguous order of interest is a huge page.
-	 * This value may be variable depending on boot parameters on IA64
+	 * This value may be variable depending on boot parameters on IA64 and
+	 * powerpc.
 	 */
 	pageblock_order = order;
 }
@@ -4326,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order)
 
 /*
  * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
- * and pageblock_default_order() are unused as pageblock_order is set
- * at compile-time. See include/linux/pageblock-flags.h for the values of
- * pageblock_order based on the kernel config
+ * is unused as pageblock_order is set at compile-time. See
+ * include/linux/pageblock-flags.h for the values of pageblock_order based on
+ * the kernel config
  */
-static inline int pageblock_default_order(unsigned int order)
+static inline void set_pageblock_order(void)
 {
-	return MAX_ORDER-1;
 }
-#define set_pageblock_order(x)	do {} while (0)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
@@ -4422,7 +4419,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		if (!size)
 			continue;
 
-		set_pageblock_order(pageblock_default_order());
+		set_pageblock_order();
 		setup_usemap(pgdat, zone, size);
 		ret = init_currently_empty_zone(zone, zone_start_pfn,
 						size, MEMMAP_EARLY);
-- 
cgit v0.10.2


From 6dccdcbe2c3ebe152847ac8507e7bded4e3f4546 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Tue, 29 May 2012 15:06:32 -0700
Subject: mm: bootmem: fix checking the bitmap when finally freeing bootmem

When bootmem releases an unaligned chunk of memory at the beginning of a
node to the page allocator, it iterates from that unaligned PFN but
checks an aligned word of the page bitmap.  The checked bits do not
correspond to the PFNs and, as a result, reserved pages can be freed.

Properly shift the bitmap word so that the lowest bit corresponds to the
starting PFN before entering the freeing loop.

This bug has been around since commit 41546c17418f ("bootmem: clean up
free_all_bootmem_core") (2.6.27) without known reports.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0131170..67872fc 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -203,6 +203,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 		} else {
 			unsigned long off = 0;
 
+			vec >>= start & (BITS_PER_LONG - 1);
 			while (vec && off < BITS_PER_LONG) {
 				if (vec & 1) {
 					page = pfn_to_page(start + off);
-- 
cgit v0.10.2


From 549381e19cc7874bb15f0e4760c1004d4fe28d8a Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:32 -0700
Subject: mm: bootmem: remove redundant offset check when finally freeing
 bootmem

When bootmem releases an unaligned BITS_PER_LONG pages chunk of memory
to the page allocator, it checks the bitmap if there are still
unreserved pages in the chunk (set bits), but also if the offset in the
chunk indicates BITS_PER_LONG loop iterations already.

But since the consulted bitmap is only a one-word-excerpt of the full
per-node bitmap, there can not be more than BITS_PER_LONG bits set in
it.  The additional offset check is unnecessary.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 67872fc..053ac3f 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -204,7 +204,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 			unsigned long off = 0;
 
 			vec >>= start & (BITS_PER_LONG - 1);
-			while (vec && off < BITS_PER_LONG) {
+			while (vec) {
 				if (vec & 1) {
 					page = pfn_to_page(start + off);
 					__free_pages_bootmem(page, 0);
-- 
cgit v0.10.2


From c6785b6bf1b2a4b47238b24ee56f61e27c3af682 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:33 -0700
Subject: mm: bootmem: rename alloc_bootmem_core to alloc_bootmem_bdata

Callsites need to provide a bootmem_data_t *, make the naming more
descriptive.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 053ac3f..ceed0df 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -468,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
 	return ALIGN(base + off, align) - base;
 }
 
-static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
+static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
 					unsigned long size, unsigned long align,
 					unsigned long goal, unsigned long limit)
 {
@@ -589,7 +589,7 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
 		p_bdata = bootmem_arch_preferred_node(bdata, size, align,
 							goal, limit);
 		if (p_bdata)
-			return alloc_bootmem_core(p_bdata, size, align,
+			return alloc_bootmem_bdata(p_bdata, size, align,
 							goal, limit);
 	}
 #endif
@@ -615,7 +615,7 @@ restart:
 		if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
 			break;
 
-		region = alloc_bootmem_core(bdata, size, align, goal, limit);
+		region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
 		if (region)
 			return region;
 	}
@@ -695,7 +695,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 	if (ptr)
 		return ptr;
 
-	ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
+	ptr = alloc_bootmem_bdata(bdata, size, align, goal, limit);
 	if (ptr)
 		return ptr;
 
@@ -744,7 +744,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 		unsigned long new_goal;
 
 		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
-		ptr = alloc_bootmem_core(pgdat->bdata, size, align,
+		ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
 						 new_goal, 0);
 		if (ptr)
 			return ptr;
@@ -773,7 +773,7 @@ void * __init alloc_bootmem_section(unsigned long size,
 	goal = pfn << PAGE_SHIFT;
 	bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
 
-	return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
+	return alloc_bootmem_bdata(bdata, size, SMP_CACHE_BYTES, goal, 0);
 }
 #endif
 
@@ -789,7 +789,7 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 	if (ptr)
 		return ptr;
 
-	ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
 		return ptr;
 
-- 
cgit v0.10.2


From c12ab504aa6076d1f1d37ee32431608ed11a1c3b Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:33 -0700
Subject: mm: bootmem: split out goal-to-node mapping from goal dropping

Matching the desired goal to the right node is one thing, dropping the
goal when it can not be satisfied is another.  Split this into separate
functions so that subsequent patches can use the node-finding but drop and
handle the goal fallback on their own terms.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/bootmem.c b/mm/bootmem.c
index ceed0df..bafeb2c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -596,7 +596,7 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
 	return NULL;
 }
 
-static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+static void * __init alloc_bootmem_core(unsigned long size,
 					unsigned long align,
 					unsigned long goal,
 					unsigned long limit)
@@ -604,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
 	bootmem_data_t *bdata;
 	void *region;
 
-restart:
 	region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
 	if (region)
 		return region;
@@ -620,6 +619,20 @@ restart:
 			return region;
 	}
 
+	return NULL;
+}
+
+static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+					      unsigned long align,
+					      unsigned long goal,
+					      unsigned long limit)
+{
+	void *ptr;
+
+restart:
+	ptr = alloc_bootmem_core(size, align, goal, limit);
+	if (ptr)
+		return ptr;
 	if (goal) {
 		goal = 0;
 		goto restart;
-- 
cgit v0.10.2


From ab3818432294a19ad793a0965d89867b4ce6255b Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:34 -0700
Subject: mm: bootmem: allocate in order node+goal, goal, node, anywhere

Match the nobootmem version of __alloc_bootmem_node.  Try to satisfy both
the node and the goal, then just the goal, then just the node, then
allocate anywhere before panicking.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/bootmem.c b/mm/bootmem.c
index bafeb2c..b5babdf 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -704,6 +704,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 {
 	void *ptr;
 
+again:
 	ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit);
 	if (ptr)
 		return ptr;
@@ -712,7 +713,18 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 	if (ptr)
 		return ptr;
 
-	return ___alloc_bootmem(size, align, goal, limit);
+	ptr = alloc_bootmem_core(size, align, goal, limit);
+	if (ptr)
+		return ptr;
+
+	if (goal) {
+		goal = 0;
+		goto again;
+	}
+
+	printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+	panic("Out of memory");
+	return NULL;
 }
 
 /**
-- 
cgit v0.10.2


From 421456edd27cf512b8f0025245a0f3572bd69b00 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:34 -0700
Subject: mm: bootmem: unify allocation policy of (non-)panicking node
 allocations

While the panicking node-specific allocation function tries to satisfy
node+goal, goal, node, anywhere, the non-panicking function still does
node+goal, goal, anywhere.

Make it simpler: define the panicking version in terms of the
non-panicking one, like the node-agnostic interface, so they always behave
the same way apart from how to deal with allocation failure.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/bootmem.c b/mm/bootmem.c
index b5babdf..d9185c3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 	return ___alloc_bootmem(size, align, goal, limit);
 }
 
-static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
+static void * __init ___alloc_bootmem_node_nopanic(bootmem_data_t *bdata,
 				unsigned long size, unsigned long align,
 				unsigned long goal, unsigned long limit)
 {
@@ -722,6 +722,29 @@ again:
 		goto again;
 	}
 
+	return NULL;
+}
+
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	return ___alloc_bootmem_node_nopanic(pgdat->bdata, size,
+					     align, goal, 0);
+}
+
+void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size,
+				    unsigned long align, unsigned long goal,
+				    unsigned long limit)
+{
+	void *ptr;
+
+	ptr = ___alloc_bootmem_node_nopanic(bdata, size, align, goal, 0);
+	if (ptr)
+		return ptr;
+
 	printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
 	panic("Out of memory");
 	return NULL;
@@ -802,25 +825,6 @@ void * __init alloc_bootmem_section(unsigned long size,
 }
 #endif
 
-void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
-				   unsigned long align, unsigned long goal)
-{
-	void *ptr;
-
-	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
-	ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
-	if (ptr)
-		return ptr;
-
-	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, 0);
-	if (ptr)
-		return ptr;
-
-	return __alloc_bootmem_nopanic(size, align, goal);
-}
-
 #ifndef ARCH_LOW_ADDRESS_LIMIT
 #define ARCH_LOW_ADDRESS_LIMIT	0xffffffffUL
 #endif
-- 
cgit v0.10.2


From 2c478eae96501163c5c5d5f682bba4d34a7ea1d4 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:35 -0700
Subject: mm: nobootmem: panic on node-specific allocation failure

__alloc_bootmem_node and __alloc_bootmem_low_node documentation claims
the functions panic on allocation failure.  Do it.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 1983fb1..cca7620 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -305,11 +305,17 @@ again:
 
 	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
 					goal, -1ULL);
-	if (!ptr && goal) {
+	if (ptr)
+		return ptr;
+
+	if (goal) {
 		goal = 0;
 		goto again;
 	}
-	return ptr;
+
+	printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+	panic("Out of memory");
+	return NULL;
 }
 
 void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -407,6 +413,12 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 	if (ptr)
 		return ptr;
 
-	return  __alloc_memory_core_early(MAX_NUMNODES, size, align,
-				goal, ARCH_LOW_ADDRESS_LIMIT);
+	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
+					goal, ARCH_LOW_ADDRESS_LIMIT);
+	if (ptr)
+		return ptr;
+
+	printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+	panic("Out of memory");
+	return NULL;
 }
-- 
cgit v0.10.2


From ba539868331874da02017e8dda8ed5b86af6d21a Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:35 -0700
Subject: mm: nobootmem: unify allocation policy of (non-)panicking node
 allocations

While the panicking node-specific allocation function tries to satisfy
node+goal, goal, node, anywhere, the non-panicking function still does
node+goal, goal, anywhere.

Make it simpler: define the panicking version in terms of the non-panicking
one, like the node-agnostic interface, so they always behave the same way
apart from how to deal with allocation failure.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index cca7620..9f40481 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -274,6 +274,57 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 	return ___alloc_bootmem(size, align, goal, limit);
 }
 
+static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+						   unsigned long size,
+						   unsigned long align,
+						   unsigned long goal,
+						   unsigned long limit)
+{
+	void *ptr;
+
+again:
+	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+					goal, limit);
+	if (ptr)
+		return ptr;
+
+	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
+					goal, limit);
+	if (ptr)
+		return ptr;
+
+	if (goal) {
+		goal = 0;
+		goto again;
+	}
+
+	return NULL;
+}
+
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+}
+
+void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+				    unsigned long align, unsigned long goal,
+				    unsigned long limit)
+{
+	void *ptr;
+
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+	if (ptr)
+		return ptr;
+
+	printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+	panic("Out of memory");
+	return NULL;
+}
+
 /**
  * __alloc_bootmem_node - allocate boot memory from a specific node
  * @pgdat: node to allocate from
@@ -292,30 +343,10 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
-	void *ptr;
-
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-again:
-	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
-					 goal, -1ULL);
-	if (ptr)
-		return ptr;
-
-	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
-					goal, -1ULL);
-	if (ptr)
-		return ptr;
-
-	if (goal) {
-		goal = 0;
-		goto again;
-	}
-
-	printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
-	panic("Out of memory");
-	return NULL;
+	return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
 }
 
 void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -346,22 +377,6 @@ void * __init alloc_bootmem_section(unsigned long size,
 }
 #endif
 
-void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
-				   unsigned long align, unsigned long goal)
-{
-	void *ptr;
-
-	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
-	ptr =  __alloc_memory_core_early(pgdat->node_id, size, align,
-						 goal, -1ULL);
-	if (ptr)
-		return ptr;
-
-	return __alloc_bootmem_nopanic(size, align, goal);
-}
-
 #ifndef ARCH_LOW_ADDRESS_LIMIT
 #define ARCH_LOW_ADDRESS_LIMIT	0xffffffffUL
 #endif
@@ -403,22 +418,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 				       unsigned long align, unsigned long goal)
 {
-	void *ptr;
-
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
-				goal, ARCH_LOW_ADDRESS_LIMIT);
-	if (ptr)
-		return ptr;
-
-	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
-					goal, ARCH_LOW_ADDRESS_LIMIT);
-	if (ptr)
-		return ptr;
-
-	printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
-	panic("Out of memory");
-	return NULL;
+	return ___alloc_bootmem_node(pgdat, size, align, goal,
+				     ARCH_LOW_ADDRESS_LIMIT);
 }
-- 
cgit v0.10.2


From e9079911e6c8fb7882de142de76f3ee49b54e000 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:36 -0700
Subject: mm: bootmem: pass pgdat instead of pgdat->bdata down the stack

Pass down the node descriptor instead of the more specific bootmem node
descriptor down the call stack, like nobootmem does, when there is no good
reason for the two to be different.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/bootmem.c b/mm/bootmem.c
index d9185c3..9d0f266 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -698,18 +698,19 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 	return ___alloc_bootmem(size, align, goal, limit);
 }
 
-static void * __init ___alloc_bootmem_node_nopanic(bootmem_data_t *bdata,
+static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				unsigned long size, unsigned long align,
 				unsigned long goal, unsigned long limit)
 {
 	void *ptr;
 
 again:
-	ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit);
+	ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
+					   align, goal, limit);
 	if (ptr)
 		return ptr;
 
-	ptr = alloc_bootmem_bdata(bdata, size, align, goal, limit);
+	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
 	if (ptr)
 		return ptr;
 
@@ -731,17 +732,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat->bdata, size,
-					     align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
 }
 
-void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size,
+void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 				    unsigned long align, unsigned long goal,
 				    unsigned long limit)
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(bdata, size, align, goal, 0);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
 	if (ptr)
 		return ptr;
 
@@ -771,7 +771,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return  ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+	return  ___alloc_bootmem_node(pgdat, size, align, goal, 0);
 }
 
 void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -869,6 +869,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node(pgdat->bdata, size, align,
-				goal, ARCH_LOW_ADDRESS_LIMIT);
+	return ___alloc_bootmem_node(pgdat, size, align,
+				     goal, ARCH_LOW_ADDRESS_LIMIT);
 }
-- 
cgit v0.10.2


From 238305bb4d418c95977162ba13c11880685fc731 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:36 -0700
Subject: mm: remove sparsemem allocation details from the bootmem allocator

alloc_bootmem_section() derives allocation area constraints from the
specified sparsemem section.  This is a bit specific for a generic memory
allocator like bootmem, though, so move it over to sparsemem.

As __alloc_bootmem_node_nopanic() already retries failed allocations with
relaxed area constraints, the fallback code in sparsemem.c can be removed
and the code becomes a bit more compact overall.

[akpm@linux-foundation.org: fix build]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 1a0cd27..324fe08 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -135,9 +135,6 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
 				   int flags);
 
-extern void *alloc_bootmem_section(unsigned long size,
-				   unsigned long section_nr);
-
 #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
 extern void *alloc_remap(int nid, unsigned long size);
 #else
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 9d0f266..d1c7a79 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -803,28 +803,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 
 }
 
-#ifdef CONFIG_SPARSEMEM
-/**
- * alloc_bootmem_section - allocate boot memory from a specific section
- * @size: size of the request in bytes
- * @section_nr: sparse map section to allocate from
- *
- * Return NULL on failure.
- */
-void * __init alloc_bootmem_section(unsigned long size,
-				    unsigned long section_nr)
-{
-	bootmem_data_t *bdata;
-	unsigned long pfn, goal;
-
-	pfn = section_nr_to_pfn(section_nr);
-	goal = pfn << PAGE_SHIFT;
-	bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
-
-	return alloc_bootmem_bdata(bdata, size, SMP_CACHE_BYTES, goal, 0);
-}
-#endif
-
 #ifndef ARCH_LOW_ADDRESS_LIMIT
 #define ARCH_LOW_ADDRESS_LIMIT	0xffffffffUL
 #endif
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 9f40481..d23415c 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -355,28 +355,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 	return __alloc_bootmem_node(pgdat, size, align, goal);
 }
 
-#ifdef CONFIG_SPARSEMEM
-/**
- * alloc_bootmem_section - allocate boot memory from a specific section
- * @size: size of the request in bytes
- * @section_nr: sparse map section to allocate from
- *
- * Return NULL on failure.
- */
-void * __init alloc_bootmem_section(unsigned long size,
-				    unsigned long section_nr)
-{
-	unsigned long pfn, goal, limit;
-
-	pfn = section_nr_to_pfn(section_nr);
-	goal = pfn << PAGE_SHIFT;
-	limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
-
-	return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
-					 SMP_CACHE_BYTES, goal, limit);
-}
-#endif
-
 #ifndef ARCH_LOW_ADDRESS_LIMIT
 #define ARCH_LOW_ADDRESS_LIMIT	0xffffffffUL
 #endif
diff --git a/mm/sparse.c b/mm/sparse.c
index a8bc7d3..6a4bf91 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -273,10 +273,10 @@ static unsigned long *__kmalloc_section_usemap(void)
 #ifdef CONFIG_MEMORY_HOTREMOVE
 static unsigned long * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
-					 unsigned long count)
+					 unsigned long size)
 {
-	unsigned long section_nr;
-
+	pg_data_t *host_pgdat;
+	unsigned long goal;
 	/*
 	 * A page may contain usemaps for other sections preventing the
 	 * page being freed and making a section unremovable while
@@ -287,8 +287,10 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 	 * from the same section as the pgdat where possible to avoid
 	 * this problem.
 	 */
-	section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
-	return alloc_bootmem_section(usemap_size() * count, section_nr);
+	goal = __pa(pgdat) & PAGE_SECTION_MASK;
+	host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT));
+	return __alloc_bootmem_node_nopanic(host_pgdat, size,
+					    SMP_CACHE_BYTES, goal);
 }
 
 static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -332,9 +334,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
 #else
 static unsigned long * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
-					 unsigned long count)
+					 unsigned long size)
 {
-	return NULL;
+	return alloc_bootmem_node_nopanic(pgdat, size);
 }
 
 static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -352,13 +354,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
 	int size = usemap_size();
 
 	usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
-								 usemap_count);
+							  size * usemap_count);
 	if (!usemap) {
-		usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
-		if (!usemap) {
-			printk(KERN_WARNING "%s: allocation failed\n", __func__);
-			return;
-		}
+		printk(KERN_WARNING "%s: allocation failed\n", __func__);
+		return;
 	}
 
 	for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
-- 
cgit v0.10.2


From 5ceb9ce6fe9462a298bb2cd5c9f1ca6cb80a0199 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 29 May 2012 15:06:37 -0700
Subject: mm: compaction: handle incorrect MIGRATE_UNMOVABLE type pageblocks

When MIGRATE_UNMOVABLE pages are freed from MIGRATE_UNMOVABLE type
pageblock (and some MIGRATE_MOVABLE pages are left in it) waiting until an
allocation takes ownership of the block may take too long.  The type of
the pageblock remains unchanged so the pageblock cannot be used as a
migration target during compaction.

Fix it by:

* Adding enum compact_mode (COMPACT_ASYNC_[MOVABLE,UNMOVABLE], and
  COMPACT_SYNC) and then converting sync field in struct compact_control
  to use it.

* Adding nr_pageblocks_skipped field to struct compact_control and
  tracking how many destination pageblocks were of MIGRATE_UNMOVABLE type.
   If COMPACT_ASYNC_MOVABLE mode compaction ran fully in
  try_to_compact_pages() (COMPACT_COMPLETE) it implies that there is not a
  suitable page for allocation.  In this case then check how if there were
  enough MIGRATE_UNMOVABLE pageblocks to try a second pass in
  COMPACT_ASYNC_UNMOVABLE mode.

* Scanning the MIGRATE_UNMOVABLE pageblocks (during COMPACT_SYNC and
  COMPACT_ASYNC_UNMOVABLE compaction modes) and building a count based on
  finding PageBuddy pages, page_count(page) == 0 or PageLRU pages.  If all
  pages within the MIGRATE_UNMOVABLE pageblock are in one of those three
  sets change the whole pageblock type to MIGRATE_MOVABLE.

My particular test case (on a ARM EXYNOS4 device with 512 MiB, which means
131072 standard 4KiB pages in 'Normal' zone) is to:

- allocate 120000 pages for kernel's usage
- free every second page (60000 pages) of memory just allocated
- allocate and use 60000 pages from user space
- free remaining 60000 pages of kernel memory
  (now we have fragmented memory occupied mostly by user space pages)
- try to allocate 100 order-9 (2048 KiB) pages for kernel's usage

The results:
- with compaction disabled I get 11 successful allocations
- with compaction enabled - 14 successful allocations
- with this patch I'm able to get all 100 successful allocations

NOTE: If we can make kswapd aware of order-0 request during compaction, we
can enhance kswapd with changing mode to COMPACT_ASYNC_FULL
(COMPACT_ASYNC_MOVABLE + COMPACT_ASYNC_UNMOVABLE).  Please see the
following thread:

	http://marc.info/?l=linux-mm&m=133552069417068&w=2

[minchan@kernel.org: minor cleanups]
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 51a90b7..e988037 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_COMPACTION_H
 #define _LINUX_COMPACTION_H
 
+#include <linux/node.h>
+
 /* Return values for compact_zone() and try_to_compact_pages() */
 /* compaction didn't start as it was not possible or direct reclaim was more suitable */
 #define COMPACT_SKIPPED		0
@@ -11,6 +13,23 @@
 /* The full zone was compacted */
 #define COMPACT_COMPLETE	3
 
+/*
+ * compaction supports three modes
+ *
+ * COMPACT_ASYNC_MOVABLE uses asynchronous migration and only scans
+ *    MIGRATE_MOVABLE pageblocks as migration sources and targets.
+ * COMPACT_ASYNC_UNMOVABLE uses asynchronous migration and only scans
+ *    MIGRATE_MOVABLE pageblocks as migration sources.
+ *    MIGRATE_UNMOVABLE pageblocks are scanned as potential migration
+ *    targets and convers them to MIGRATE_MOVABLE if possible
+ * COMPACT_SYNC uses synchronous migration and scans all pageblocks
+ */
+enum compact_mode {
+	COMPACT_ASYNC_MOVABLE,
+	COMPACT_ASYNC_UNMOVABLE,
+	COMPACT_SYNC,
+};
+
 #ifdef CONFIG_COMPACTION
 extern int sysctl_compact_memory;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
diff --git a/mm/compaction.c b/mm/compaction.c
index da7d35e..840ee28 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -235,7 +235,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	 */
 	while (unlikely(too_many_isolated(zone))) {
 		/* async migration should just abort */
-		if (!cc->sync)
+		if (cc->mode != COMPACT_SYNC)
 			return 0;
 
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -303,7 +303,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		 * satisfies the allocation
 		 */
 		pageblock_nr = low_pfn >> pageblock_order;
-		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+		if (cc->mode != COMPACT_SYNC &&
+		    last_pageblock_nr != pageblock_nr &&
 		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
 			low_pfn += pageblock_nr_pages;
 			low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
@@ -324,7 +325,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			continue;
 		}
 
-		if (!cc->sync)
+		if (cc->mode != COMPACT_SYNC)
 			mode |= ISOLATE_ASYNC_MIGRATE;
 
 		/* Try isolate the page */
@@ -357,27 +358,90 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
 #ifdef CONFIG_COMPACTION
+/*
+ * Returns true if MIGRATE_UNMOVABLE pageblock was successfully
+ * converted to MIGRATE_MOVABLE type, false otherwise.
+ */
+static bool rescue_unmovable_pageblock(struct page *page)
+{
+	unsigned long pfn, start_pfn, end_pfn;
+	struct page *start_page, *end_page;
+
+	pfn = page_to_pfn(page);
+	start_pfn = pfn & ~(pageblock_nr_pages - 1);
+	end_pfn = start_pfn + pageblock_nr_pages;
+
+	start_page = pfn_to_page(start_pfn);
+	end_page = pfn_to_page(end_pfn);
+
+	/* Do not deal with pageblocks that overlap zones */
+	if (page_zone(start_page) != page_zone(end_page))
+		return false;
+
+	for (page = start_page, pfn = start_pfn; page < end_page; pfn++,
+								  page++) {
+		if (!pfn_valid_within(pfn))
+			continue;
+
+		if (PageBuddy(page)) {
+			int order = page_order(page);
+
+			pfn += (1 << order) - 1;
+			page += (1 << order) - 1;
+
+			continue;
+		} else if (page_count(page) == 0 || PageLRU(page))
+			continue;
+
+		return false;
+	}
+
+	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+	move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE);
+	return true;
+}
 
-/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
+enum smt_result {
+	GOOD_AS_MIGRATION_TARGET,
+	FAIL_UNMOVABLE_TARGET,
+	FAIL_BAD_TARGET,
+};
+
+/*
+ * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block
+ * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page
+ * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise.
+ */
+static enum smt_result suitable_migration_target(struct page *page,
+				      struct compact_control *cc)
 {
 
 	int migratetype = get_pageblock_migratetype(page);
 
 	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
 	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
-		return false;
+		return FAIL_BAD_TARGET;
 
 	/* If the page is a large free page, then allow migration */
 	if (PageBuddy(page) && page_order(page) >= pageblock_order)
-		return true;
+		return GOOD_AS_MIGRATION_TARGET;
 
 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
-	if (migrate_async_suitable(migratetype))
-		return true;
+	if (cc->mode != COMPACT_ASYNC_UNMOVABLE &&
+	    migrate_async_suitable(migratetype))
+		return GOOD_AS_MIGRATION_TARGET;
+
+	if (cc->mode == COMPACT_ASYNC_MOVABLE &&
+	    migratetype == MIGRATE_UNMOVABLE)
+		return FAIL_UNMOVABLE_TARGET;
+
+	if (cc->mode != COMPACT_ASYNC_MOVABLE &&
+	    migratetype == MIGRATE_UNMOVABLE &&
+	    rescue_unmovable_pageblock(page))
+		return GOOD_AS_MIGRATION_TARGET;
 
 	/* Otherwise skip the block */
-	return false;
+	return FAIL_BAD_TARGET;
 }
 
 /*
@@ -411,6 +475,13 @@ static void isolate_freepages(struct zone *zone,
 	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
 
 	/*
+	 * isolate_freepages() may be called more than once during
+	 * compact_zone_order() run and we want only the most recent
+	 * count.
+	 */
+	cc->nr_pageblocks_skipped = 0;
+
+	/*
 	 * Isolate free pages until enough are available to migrate the
 	 * pages on cc->migratepages. We stop searching if the migrate
 	 * and free page scanners meet or enough free pages are isolated.
@@ -418,6 +489,7 @@ static void isolate_freepages(struct zone *zone,
 	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
 					pfn -= pageblock_nr_pages) {
 		unsigned long isolated;
+		enum smt_result ret;
 
 		if (!pfn_valid(pfn))
 			continue;
@@ -434,9 +506,12 @@ static void isolate_freepages(struct zone *zone,
 			continue;
 
 		/* Check the block is suitable for migration */
-		if (!suitable_migration_target(page))
+		ret = suitable_migration_target(page, cc);
+		if (ret != GOOD_AS_MIGRATION_TARGET) {
+			if (ret == FAIL_UNMOVABLE_TARGET)
+				cc->nr_pageblocks_skipped++;
 			continue;
-
+		}
 		/*
 		 * Found a block suitable for isolating free pages from. Now
 		 * we disabled interrupts, double check things are ok and
@@ -445,12 +520,14 @@ static void isolate_freepages(struct zone *zone,
 		 */
 		isolated = 0;
 		spin_lock_irqsave(&zone->lock, flags);
-		if (suitable_migration_target(page)) {
+		ret = suitable_migration_target(page, cc);
+		if (ret == GOOD_AS_MIGRATION_TARGET) {
 			end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
 			isolated = isolate_freepages_block(pfn, end_pfn,
 							   freelist, false);
 			nr_freepages += isolated;
-		}
+		} else if (ret == FAIL_UNMOVABLE_TARGET)
+			cc->nr_pageblocks_skipped++;
 		spin_unlock_irqrestore(&zone->lock, flags);
 
 		/*
@@ -682,8 +759,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
-				(unsigned long)cc, false,
-				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
+			(unsigned long)&cc->freepages, false,
+			(cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT
+						      : MIGRATE_ASYNC);
 		update_nr_listpages(cc);
 		nr_remaining = cc->nr_migratepages;
 
@@ -712,7 +790,8 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
-				 bool sync)
+				 enum compact_mode mode,
+				 unsigned long *nr_pageblocks_skipped)
 {
 	struct compact_control cc = {
 		.nr_freepages = 0,
@@ -720,12 +799,17 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.order = order,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
-		.sync = sync,
+		.mode = mode,
 	};
+	unsigned long rc;
+
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
 
-	return compact_zone(zone, &cc);
+	rc = compact_zone(zone, &cc);
+	*nr_pageblocks_skipped = cc.nr_pageblocks_skipped;
+
+	return rc;
 }
 
 int sysctl_extfrag_threshold = 500;
@@ -750,6 +834,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	struct zoneref *z;
 	struct zone *zone;
 	int rc = COMPACT_SKIPPED;
+	unsigned long nr_pageblocks_skipped;
+	enum compact_mode mode;
 
 	/*
 	 * Check whether it is worth even starting compaction. The order check is
@@ -766,12 +852,22 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 								nodemask) {
 		int status;
 
-		status = compact_zone_order(zone, order, gfp_mask, sync);
+		mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE;
+retry:
+		status = compact_zone_order(zone, order, gfp_mask, mode,
+						&nr_pageblocks_skipped);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
 		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
 			break;
+
+		if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) {
+			if (nr_pageblocks_skipped) {
+				mode = COMPACT_ASYNC_UNMOVABLE;
+				goto retry;
+			}
+		}
 	}
 
 	return rc;
@@ -805,7 +901,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 			if (ok && cc->order > zone->compact_order_failed)
 				zone->compact_order_failed = cc->order + 1;
 			/* Currently async compaction is never deferred. */
-			else if (!ok && cc->sync)
+			else if (!ok && cc->mode == COMPACT_SYNC)
 				defer_compaction(zone, cc->order);
 		}
 
@@ -820,7 +916,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
 {
 	struct compact_control cc = {
 		.order = order,
-		.sync = false,
+		.mode = COMPACT_ASYNC_MOVABLE,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
@@ -830,7 +926,7 @@ static int compact_node(int nid)
 {
 	struct compact_control cc = {
 		.order = -1,
-		.sync = true,
+		.mode = COMPACT_SYNC,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/internal.h b/mm/internal.h
index 8b0fc8d..4194ab9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -94,6 +94,9 @@ extern void putback_lru_page(struct page *page);
 /*
  * in mm/page_alloc.c
  */
+extern void set_pageblock_migratetype(struct page *page, int migratetype);
+extern int move_freepages_block(struct zone *zone, struct page *page,
+				int migratetype);
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 extern void prep_compound_page(struct page *page, unsigned long order);
 #ifdef CONFIG_MEMORY_FAILURE
@@ -101,6 +104,7 @@ extern bool is_free_buddy_page(struct page *page);
 #endif
 
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
+#include <linux/compaction.h>
 
 /*
  * in mm/compaction.c
@@ -119,11 +123,14 @@ struct compact_control {
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
 	unsigned long free_pfn;		/* isolate_freepages search base */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
-	bool sync;			/* Synchronous migration */
+	enum compact_mode mode;		/* Compaction mode */
 
 	int order;			/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
+
+	/* Number of UNMOVABLE destination pageblocks skipped during scan */
+	unsigned long nr_pageblocks_skipped;
 };
 
 unsigned long
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 84f2c59..457b4de 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -219,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
-static void set_pageblock_migratetype(struct page *page, int migratetype)
+void set_pageblock_migratetype(struct page *page, int migratetype)
 {
 
 	if (unlikely(page_group_by_mobility_disabled))
@@ -954,8 +954,8 @@ static int move_freepages(struct zone *zone,
 	return pages_moved;
 }
 
-static int move_freepages_block(struct zone *zone, struct page *page,
-				int migratetype)
+int move_freepages_block(struct zone *zone, struct page *page,
+			 int migratetype)
 {
 	unsigned long start_pfn, end_pfn;
 	struct page *start_page, *end_page;
@@ -5657,7 +5657,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
 		.nr_migratepages = 0,
 		.order = -1,
 		.zone = page_zone(pfn_to_page(start)),
-		.sync = true,
+		.mode = COMPACT_SYNC,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);
 
-- 
cgit v0.10.2


From bde05d1ccd512696b09db9dd2e5f33ad19152605 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:38 -0700
Subject: shmem: replace page if mapping excludes its zone

The GMA500 GPU driver uses GEM shmem objects, but with a new twist: the
backing RAM has to be below 4GB.  Not a problem while the boards
supported only 4GB: but now Intel's D2700MUD boards support 8GB, and
their GMA3600 is managed by the GMA500 driver.

shmem/tmpfs has never pretended to support hardware restrictions on the
backing memory, but it might have appeared to do so before v3.1, and
even now it works fine until a page is swapped out then back in.  When
read_cache_page_gfp() supplied a freshly allocated page for copy, that
compensated for whatever choice might have been made by earlier swapin
readahead; but swapoff was likely to destroy the illusion.

We'd like to continue to support GMA500, so now add a new
shmem_should_replace_page() check on the zone when about to move a page
from swapcache to filecache (in swapin and swapoff cases), with
shmem_replace_page() to allocate and substitute a suitable page (given
gma500/gem.c's mapping_set_gfp_mask GFP_KERNEL | __GFP_DMA32).

This does involve a minor extension to mem_cgroup_replace_page_cache()
(the page may or may not have already been charged); and I've removed a
comment and call to mem_cgroup_uncharge_cache_page(), which in fact is
always a no-op while PageSwapCache.

Also removed optimization of an unlikely path in shmem_getpage_gfp(),
now that we need to check PageSwapCache more carefully (a racing caller
might already have made the copy).  And at one point shmem_unuse_inode()
needs to use the hitherto private page_swapcount(), to guard against
racing with inode eviction.

It would make sense to extend shmem_should_replace_page(), to cover
cpuset and NUMA mempolicy restrictions too, but set that aside for now:
needs a cleanup of shmem mempolicy handling, and more testing, and ought
to handle swap faults in do_swap_page() as well as shmem.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Stephane Marchesin <marcheu@chromium.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Rob Clark <rob.clark@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/swap.h b/include/linux/swap.h
index bc3073c..d965c4b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -351,6 +351,7 @@ extern int swap_type_of(dev_t, sector_t, struct block_device **);
 extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
+extern int page_swapcount(struct page *);
 extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
@@ -445,6 +446,11 @@ static inline void delete_from_swap_cache(struct page *page)
 {
 }
 
+static inline int page_swapcount(struct page *page)
+{
+	return 0;
+}
+
 #define reuse_swap_page(page)	(page_mapcount(page) == 1)
 
 static inline int try_to_free_swap(struct page *page)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4f71219..d7ce417 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3373,7 +3373,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 void mem_cgroup_replace_page_cache(struct page *oldpage,
 				  struct page *newpage)
 {
-	struct mem_cgroup *memcg;
+	struct mem_cgroup *memcg = NULL;
 	struct page_cgroup *pc;
 	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
 
@@ -3383,11 +3383,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
 	pc = lookup_page_cgroup(oldpage);
 	/* fix accounting on old pages */
 	lock_page_cgroup(pc);
-	memcg = pc->mem_cgroup;
-	mem_cgroup_charge_statistics(memcg, false, -1);
-	ClearPageCgroupUsed(pc);
+	if (PageCgroupUsed(pc)) {
+		memcg = pc->mem_cgroup;
+		mem_cgroup_charge_statistics(memcg, false, -1);
+		ClearPageCgroupUsed(pc);
+	}
 	unlock_page_cgroup(pc);
 
+	/*
+	 * When called from shmem_replace_page(), in some cases the
+	 * oldpage has already been charged, and in some cases not.
+	 */
+	if (!memcg)
+		return;
+
 	if (PageSwapBacked(oldpage))
 		type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index be5af34..db72d8e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -103,6 +103,9 @@ static unsigned long shmem_default_max_inodes(void)
 }
 #endif
 
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+				struct shmem_inode_info *info, pgoff_t index);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
 
@@ -604,12 +607,13 @@ static void shmem_evict_inode(struct inode *inode)
  * If swap found in inode, free it and move page from swapcache to filecache.
  */
 static int shmem_unuse_inode(struct shmem_inode_info *info,
-			     swp_entry_t swap, struct page *page)
+			     swp_entry_t swap, struct page **pagep)
 {
 	struct address_space *mapping = info->vfs_inode.i_mapping;
 	void *radswap;
 	pgoff_t index;
-	int error;
+	gfp_t gfp;
+	int error = 0;
 
 	radswap = swp_to_radix_entry(swap);
 	index = radix_tree_locate_item(&mapping->page_tree, radswap);
@@ -625,22 +629,37 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	if (shmem_swaplist.next != &info->swaplist)
 		list_move_tail(&shmem_swaplist, &info->swaplist);
 
+	gfp = mapping_gfp_mask(mapping);
+	if (shmem_should_replace_page(*pagep, gfp)) {
+		mutex_unlock(&shmem_swaplist_mutex);
+		error = shmem_replace_page(pagep, gfp, info, index);
+		mutex_lock(&shmem_swaplist_mutex);
+		/*
+		 * We needed to drop mutex to make that restrictive page
+		 * allocation; but the inode might already be freed by now,
+		 * and we cannot refer to inode or mapping or info to check.
+		 * However, we do hold page lock on the PageSwapCache page,
+		 * so can check if that still has our reference remaining.
+		 */
+		if (!page_swapcount(*pagep))
+			error = -ENOENT;
+	}
+
 	/*
 	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
 	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
 	 * beneath us (pagelock doesn't help until the page is in pagecache).
 	 */
-	error = shmem_add_to_page_cache(page, mapping, index,
+	if (!error)
+		error = shmem_add_to_page_cache(*pagep, mapping, index,
 						GFP_NOWAIT, radswap);
-	/* which does mem_cgroup_uncharge_cache_page on error */
-
 	if (error != -ENOMEM) {
 		/*
 		 * Truncation and eviction use free_swap_and_cache(), which
 		 * only does trylock page: if we raced, best clean up here.
 		 */
-		delete_from_swap_cache(page);
-		set_page_dirty(page);
+		delete_from_swap_cache(*pagep);
+		set_page_dirty(*pagep);
 		if (!error) {
 			spin_lock(&info->lock);
 			info->swapped--;
@@ -660,7 +679,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	struct list_head *this, *next;
 	struct shmem_inode_info *info;
 	int found = 0;
-	int error;
+	int error = 0;
+
+	/*
+	 * There's a faint possibility that swap page was replaced before
+	 * caller locked it: it will come back later with the right page.
+	 */
+	if (unlikely(!PageSwapCache(page)))
+		goto out;
 
 	/*
 	 * Charge page using GFP_KERNEL while we can wait, before taking
@@ -676,7 +702,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	list_for_each_safe(this, next, &shmem_swaplist) {
 		info = list_entry(this, struct shmem_inode_info, swaplist);
 		if (info->swapped)
-			found = shmem_unuse_inode(info, swap, page);
+			found = shmem_unuse_inode(info, swap, &page);
 		else
 			list_del_init(&info->swaplist);
 		cond_resched();
@@ -685,8 +711,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (!found)
-		mem_cgroup_uncharge_cache_page(page);
 	if (found < 0)
 		error = found;
 out:
@@ -856,6 +880,84 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 #endif
 
 /*
+ * When a page is moved from swapcache to shmem filecache (either by the
+ * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * shmem_unuse_inode()), it may have been read in earlier from swap, in
+ * ignorance of the mapping it belongs to.  If that mapping has special
+ * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
+ * we may need to copy to a suitable page before moving to filecache.
+ *
+ * In a future release, this may well be extended to respect cpuset and
+ * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
+ * but for now it is a simple matter of zone.
+ */
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+{
+	return page_zonenum(page) > gfp_zone(gfp);
+}
+
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+				struct shmem_inode_info *info, pgoff_t index)
+{
+	struct page *oldpage, *newpage;
+	struct address_space *swap_mapping;
+	pgoff_t swap_index;
+	int error;
+
+	oldpage = *pagep;
+	swap_index = page_private(oldpage);
+	swap_mapping = page_mapping(oldpage);
+
+	/*
+	 * We have arrived here because our zones are constrained, so don't
+	 * limit chance of success by further cpuset and node constraints.
+	 */
+	gfp &= ~GFP_CONSTRAINT_MASK;
+	newpage = shmem_alloc_page(gfp, info, index);
+	if (!newpage)
+		return -ENOMEM;
+	VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
+
+	*pagep = newpage;
+	page_cache_get(newpage);
+	copy_highpage(newpage, oldpage);
+
+	VM_BUG_ON(!PageLocked(oldpage));
+	__set_page_locked(newpage);
+	VM_BUG_ON(!PageUptodate(oldpage));
+	SetPageUptodate(newpage);
+	VM_BUG_ON(!PageSwapBacked(oldpage));
+	SetPageSwapBacked(newpage);
+	VM_BUG_ON(!swap_index);
+	set_page_private(newpage, swap_index);
+	VM_BUG_ON(!PageSwapCache(oldpage));
+	SetPageSwapCache(newpage);
+
+	/*
+	 * Our caller will very soon move newpage out of swapcache, but it's
+	 * a nice clean interface for us to replace oldpage by newpage there.
+	 */
+	spin_lock_irq(&swap_mapping->tree_lock);
+	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
+								   newpage);
+	__inc_zone_page_state(newpage, NR_FILE_PAGES);
+	__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	spin_unlock_irq(&swap_mapping->tree_lock);
+	BUG_ON(error);
+
+	mem_cgroup_replace_page_cache(oldpage, newpage);
+	lru_cache_add_anon(newpage);
+
+	ClearPageSwapCache(oldpage);
+	set_page_private(oldpage, 0);
+
+	unlock_page(oldpage);
+	page_cache_release(oldpage);
+	page_cache_release(oldpage);
+	return 0;
+}
+
+/*
  * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
  *
  * If we allocate a new one we do not mark it dirty. That's up to the
@@ -923,19 +1025,20 @@ repeat:
 
 		/* We have to do this with page locked to prevent races */
 		lock_page(page);
+		if (!PageSwapCache(page) || page->mapping) {
+			error = -EEXIST;	/* try again */
+			goto failed;
+		}
 		if (!PageUptodate(page)) {
 			error = -EIO;
 			goto failed;
 		}
 		wait_on_page_writeback(page);
 
-		/* Someone may have already done it for us */
-		if (page->mapping) {
-			if (page->mapping == mapping &&
-			    page->index == index)
-				goto done;
-			error = -EEXIST;
-			goto failed;
+		if (shmem_should_replace_page(page, gfp)) {
+			error = shmem_replace_page(&page, gfp, info, index);
+			if (error)
+				goto failed;
 		}
 
 		error = mem_cgroup_cache_charge(page, current->mm,
@@ -998,7 +1101,7 @@ repeat:
 		if (sgp == SGP_DIRTY)
 			set_page_dirty(page);
 	}
-done:
+
 	/* Perhaps the file has been truncated since we checked */
 	if (sgp != SGP_WRITE &&
 	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fafc26d..b0c86e9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -601,7 +601,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
  * This does not give an exact answer when swap count is continued,
  * but does include the high COUNT_CONTINUED flag to allow for that.
  */
-static inline int page_swapcount(struct page *page)
+int page_swapcount(struct page *page)
 {
 	int count = 0;
 	struct swap_info_struct *p;
-- 
cgit v0.10.2


From 2f6e38f3cd17a7858112f538c1700c747170db1f Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:38 -0700
Subject: tmpfs: enable NOSEC optimization

Let tmpfs into the NOSEC optimization (avoiding file_remove_suid()
overhead on most common writes): set MS_NOSEC on its superblocks.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/shmem.c b/mm/shmem.c
index db72d8e..fe5ae69 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2373,6 +2373,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	}
 	sb->s_export_op = &shmem_export_ops;
+	sb->s_flags |= MS_NOSEC;
 #else
 	sb->s_flags |= MS_NOUSER;
 #endif
-- 
cgit v0.10.2


From ec9516fbc5fa814014991e1ae7f8860127122105 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:39 -0700
Subject: tmpfs: optimize clearing when writing

Nick proposed years ago that tmpfs should avoid clearing its pages where
write will overwrite them with new data, as ramfs has long done.  But I
messed it up and just got bad data.  Tried again recently, it works
fine.

Here's time output for writing 4GiB 16 times on this Core i5 laptop:

before: real	0m21.169s user	0m0.028s sys	0m21.057s
        real	0m21.382s user	0m0.016s sys	0m21.289s
        real	0m21.311s user	0m0.020s sys	0m21.217s

after:  real	0m18.273s user	0m0.032s sys	0m18.165s
        real	0m18.354s user	0m0.020s sys	0m18.265s
        real	0m18.440s user	0m0.032s sys	0m18.337s

ramfs:  real	0m16.860s user	0m0.028s sys	0m16.765s
        real	0m17.382s user	0m0.040s sys	0m17.273s
        real	0m17.133s user	0m0.044s sys	0m17.021s

Yes, I have done perf reports, but they need more explanation than they
deserve: in summary, clear_page vanishes, its cache loading shifts into
copy_user_generic_unrolled; shmem_getpage_gfp goes down, and
surprisingly mark_page_accessed goes way up - I think because they are
respectively where the cache gets to be reloaded after being purged by
clear or copy.

Suggested-by: Nick Piggin <npiggin@gmail.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/shmem.c b/mm/shmem.c
index fe5ae69..45c2647 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1095,9 +1095,14 @@ repeat:
 		shmem_recalc_inode(inode);
 		spin_unlock(&info->lock);
 
-		clear_highpage(page);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
+		/*
+		 * Let SGP_WRITE caller clear ends if write does not fill page
+		 */
+		if (sgp != SGP_WRITE) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			SetPageUptodate(page);
+		}
 		if (sgp == SGP_DIRTY)
 			set_page_dirty(page);
 	}
@@ -1307,6 +1312,14 @@ shmem_write_end(struct file *file, struct address_space *mapping,
 	if (pos + copied > inode->i_size)
 		i_size_write(inode, pos + copied);
 
+	if (!PageUptodate(page)) {
+		if (copied < PAGE_CACHE_SIZE) {
+			unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+			zero_user_segments(page, 0, from,
+					from + copied, PAGE_CACHE_SIZE);
+		}
+		SetPageUptodate(page);
+	}
 	set_page_dirty(page);
 	unlock_page(page);
 	page_cache_release(page);
@@ -1768,6 +1781,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 		kaddr = kmap_atomic(page);
 		memcpy(kaddr, symname, len);
 		kunmap_atomic(kaddr);
+		SetPageUptodate(page);
 		set_page_dirty(page);
 		unlock_page(page);
 		page_cache_release(page);
-- 
cgit v0.10.2


From 83e4fa9c16e4af7122e31be3eca5d57881d236fe Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:40 -0700
Subject: tmpfs: support fallocate FALLOC_FL_PUNCH_HOLE

tmpfs has supported hole-punching since 2.6.16, via
madvise(,,MADV_REMOVE).

But nowadays fallocate(,FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,,) is
the agreed way to punch holes.

So add shmem_fallocate() to support that, and tweak shmem_truncate_range()
to support partial pages at both the beginning and end of range (never
needed for madvise, which demands rounded addr and rounds up length).

Based-on-patch-by: Cong Wang <amwang@redhat.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Cong Wang <amwang@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/shmem.c b/mm/shmem.c
index 45c2647..7e54ff1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt;
 #include <linux/blkdev.h>
 #include <linux/pagevec.h>
 #include <linux/percpu_counter.h>
+#include <linux/falloc.h>
 #include <linux/splice.h>
 #include <linux/security.h>
 #include <linux/swapops.h>
@@ -432,21 +433,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
-	pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
+	pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
+	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
 	struct pagevec pvec;
 	pgoff_t indices[PAGEVEC_SIZE];
 	long nr_swaps_freed = 0;
 	pgoff_t index;
 	int i;
 
-	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+	if (lend == -1)
+		end = -1;	/* unsigned, so actually very big */
 
 	pagevec_init(&pvec, 0);
 	index = start;
-	while (index <= end) {
+	while (index < end) {
 		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 							pvec.pages, indices);
 		if (!pvec.nr)
 			break;
@@ -455,7 +458,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 			struct page *page = pvec.pages[i];
 
 			index = indices[i];
-			if (index > end)
+			if (index >= end)
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
@@ -479,22 +482,39 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 		index++;
 	}
 
-	if (partial) {
+	if (partial_start) {
 		struct page *page = NULL;
 		shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
 		if (page) {
-			zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+			unsigned int top = PAGE_CACHE_SIZE;
+			if (start > end) {
+				top = partial_end;
+				partial_end = 0;
+			}
+			zero_user_segment(page, partial_start, top);
+			set_page_dirty(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+	if (partial_end) {
+		struct page *page = NULL;
+		shmem_getpage(inode, end, &page, SGP_READ, NULL);
+		if (page) {
+			zero_user_segment(page, 0, partial_end);
 			set_page_dirty(page);
 			unlock_page(page);
 			page_cache_release(page);
 		}
 	}
+	if (start >= end)
+		return;
 
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
 		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 							pvec.pages, indices);
 		if (!pvec.nr) {
 			if (index == start)
@@ -502,7 +522,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 			index = start;
 			continue;
 		}
-		if (index == start && indices[0] > end) {
+		if (index == start && indices[0] >= end) {
 			shmem_deswap_pagevec(&pvec);
 			pagevec_release(&pvec);
 			break;
@@ -512,7 +532,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 			struct page *page = pvec.pages[i];
 
 			index = indices[i];
-			if (index > end)
+			if (index >= end)
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
@@ -1578,6 +1598,31 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	return error;
 }
 
+static long shmem_fallocate(struct file *file, int mode, loff_t offset,
+							 loff_t len)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int error = -EOPNOTSUPP;
+
+	mutex_lock(&inode->i_mutex);
+
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		struct address_space *mapping = file->f_mapping;
+		loff_t unmap_start = round_up(offset, PAGE_SIZE);
+		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+
+		if ((u64)unmap_end > (u64)unmap_start)
+			unmap_mapping_range(mapping, unmap_start,
+					    1 + unmap_end - unmap_start, 0);
+		shmem_truncate_range(inode, offset, offset + len - 1);
+		/* No need to unmap again: hole-punching leaves COWed pages */
+		error = 0;
+	}
+
+	mutex_unlock(&inode->i_mutex);
+	return error;
+}
+
 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -2490,6 +2535,7 @@ static const struct file_operations shmem_file_operations = {
 	.fsync		= noop_fsync,
 	.splice_read	= shmem_file_splice_read,
 	.splice_write	= generic_file_splice_write,
+	.fallocate	= shmem_fallocate,
 #endif
 };
 
-- 
cgit v0.10.2


From 3f31d07571eeea18a7d34db9af21d2285b807a17 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:40 -0700
Subject: mm/fs: route MADV_REMOVE to FALLOC_FL_PUNCH_HOLE

Now tmpfs supports hole-punching via fallocate(), switch madvise_remove()
to use do_fallocate() instead of vmtruncate_range(): which extends
madvise(,,MADV_REMOVE) support from tmpfs to ext4, ocfs2 and xfs.

There is one more user of vmtruncate_range() in our tree,
staging/android's ashmem_shrink(): convert it to use do_fallocate() too
(but if its unpinned areas are already unmapped - I don't know - then it
would do better to use shmem_truncate_range() directly).

Based-on-patch-by: Cong Wang <amwang@redhat.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Colin Cross <ccross@android.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Andreas Dilger <adilger@dilger.ca>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ben Myers <bpm@sgi.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index 4511420..e84dbec 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/falloc.h>
 #include <linux/miscdevice.h>
 #include <linux/security.h>
 #include <linux/mm.h>
@@ -363,11 +364,12 @@ static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc)
 
 	mutex_lock(&ashmem_mutex);
 	list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) {
-		struct inode *inode = range->asma->file->f_dentry->d_inode;
 		loff_t start = range->pgstart * PAGE_SIZE;
-		loff_t end = (range->pgend + 1) * PAGE_SIZE - 1;
+		loff_t end = (range->pgend + 1) * PAGE_SIZE;
 
-		vmtruncate_range(inode, start, end);
+		do_fallocate(range->asma->file,
+				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+				start, end - start);
 		range->purged = ASHMEM_WAS_PURGED;
 		lru_del(range);
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 1ccbba5..deff1b6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,8 +11,10 @@
 #include <linux/mempolicy.h>
 #include <linux/page-isolation.h>
 #include <linux/hugetlb.h>
+#include <linux/falloc.h>
 #include <linux/sched.h>
 #include <linux/ksm.h>
+#include <linux/fs.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -200,8 +202,7 @@ static long madvise_remove(struct vm_area_struct *vma,
 				struct vm_area_struct **prev,
 				unsigned long start, unsigned long end)
 {
-	struct address_space *mapping;
-	loff_t offset, endoff;
+	loff_t offset;
 	int error;
 
 	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
@@ -217,16 +218,14 @@ static long madvise_remove(struct vm_area_struct *vma,
 	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
 		return -EACCES;
 
-	mapping = vma->vm_file->f_mapping;
-
 	offset = (loff_t)(start - vma->vm_start)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
-	endoff = (loff_t)(end - vma->vm_start - 1)
-			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	/* vmtruncate_range needs to take i_mutex */
+	/* filesystem's fallocate may need to take i_mutex */
 	up_read(&current->mm->mmap_sem);
-	error = vmtruncate_range(mapping->host, offset, endoff);
+	error = do_fallocate(vma->vm_file,
+				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+				offset, end - start);
 	down_read(&current->mm->mmap_sem);
 	return error;
 }
-- 
cgit v0.10.2


From 17cf28afea2a1112f240a3a2da8af883be024811 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:41 -0700
Subject: mm/fs: remove truncate_range

Remove vmtruncate_range(), and remove the truncate_range method from
struct inode_operations: only tmpfs ever supported it, and tmpfs has now
converted over to using the fallocate method of file_operations.

Update Documentation accordingly, adding (setlease and) fallocate lines.
And while we're in mm.h, remove duplicate declarations of shmem_lock() and
shmem_file_setup(): everyone is now using the ones in shmem_fs.h.

Based-on-patch-by: Cong Wang <amwang@redhat.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Cong Wang <amwang@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 4fca82e..d449e63 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -60,7 +60,6 @@ ata *);
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
-	void (*truncate_range)(struct inode *, loff_t, loff_t);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
 
 locking rules:
@@ -87,7 +86,6 @@ setxattr:	yes
 getxattr:	no
 listxattr:	no
 removexattr:	yes
-truncate_range:	yes
 fiemap:		no
 	Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
 victim.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 0d04920..ef19f91 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -363,7 +363,6 @@ struct inode_operations {
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
-	void (*truncate_range)(struct inode *, loff_t, loff_t);
 };
 
 Again, all methods are called without any locks being held, unless
@@ -472,9 +471,6 @@ otherwise noted.
   removexattr: called by the VFS to remove an extended attribute from
   	a file. This method is called by removexattr(2) system call.
 
-  truncate_range: a method provided by the underlying filesystem to truncate a
-  	range of blocks , i.e. punch a hole somewhere in a file.
-
 
 The Address Space Object
 ========================
@@ -760,7 +756,7 @@ struct file_operations
 ----------------------
 
 This describes how the VFS can manipulate an open file. As of kernel
-2.6.22, the following members are defined:
+3.5, the following members are defined:
 
 struct file_operations {
 	struct module *owner;
@@ -790,6 +786,8 @@ struct file_operations {
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
+	int (*setlease)(struct file *, long arg, struct file_lock **);
+	long (*fallocate)(struct file *, int mode, loff_t offset, loff_t len);
 };
 
 Again, all methods are called without any locks being held, unless
@@ -858,6 +856,11 @@ otherwise noted.
   splice_read: called by the VFS to splice data from file to a pipe. This
 	       method is used by the splice(2) system call
 
+  setlease: called by the VFS to set or release a file lock lease.
+	    setlease has the file_lock_lock held and must not sleep.
+
+  fallocate: called by the VFS to preallocate blocks or punch a hole.
+
 Note that the file operations are implemented by the specific
 filesystem in which the inode resides. When opening a device node
 (character or block special) most filesystems will call special
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 37268c5..1b35d6b 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -292,7 +292,6 @@ static const struct inode_operations bad_inode_ops =
 	.getxattr	= bad_inode_getxattr,
 	.listxattr	= bad_inode_listxattr,
 	.removexattr	= bad_inode_removexattr,
-	/* truncate_range returns void */
 };
 
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cdc1a96..038076b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1681,7 +1681,6 @@ struct inode_operations {
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
-	void (*truncate_range)(struct inode *, loff_t, loff_t);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
 		      u64 len);
 } ____cacheline_aligned;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7d5c37f..aa20baf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -871,8 +871,6 @@ extern void pagefault_out_of_memory(void);
 extern void show_free_areas(unsigned int flags);
 extern bool skip_free_areas_node(unsigned int flags, int nid);
 
-int shmem_lock(struct file *file, int lock, struct user_struct *user);
-struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags);
 int shmem_zero_setup(struct vm_area_struct *);
 
 extern int can_do_mlock(void);
@@ -951,11 +949,9 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 extern int vmtruncate(struct inode *inode, loff_t offset);
-extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end);
 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
 int truncate_inode_page(struct address_space *mapping, struct page *page);
 int generic_error_remove_page(struct address_space *mapping, struct page *page);
-
 int invalidate_inode_page(struct page *page);
 
 #ifdef CONFIG_MMU
diff --git a/mm/shmem.c b/mm/shmem.c
index 7e54ff1..f368d0a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2541,7 +2541,6 @@ static const struct file_operations shmem_file_operations = {
 
 static const struct inode_operations shmem_inode_operations = {
 	.setattr	= shmem_setattr,
-	.truncate_range	= shmem_truncate_range,
 #ifdef CONFIG_TMPFS_XATTR
 	.setxattr	= shmem_setxattr,
 	.getxattr	= shmem_getxattr,
diff --git a/mm/truncate.c b/mm/truncate.c
index 61a183b..75801ac 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize)
 }
 EXPORT_SYMBOL(vmtruncate);
 
-int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
-{
-	struct address_space *mapping = inode->i_mapping;
-	loff_t holebegin = round_up(lstart, PAGE_SIZE);
-	loff_t holelen = 1 + lend - holebegin;
-
-	/*
-	 * If the underlying filesystem is not going to provide
-	 * a way to truncate a range of blocks (punch a hole) -
-	 * we should return failure right now.
-	 */
-	if (!inode->i_op->truncate_range)
-		return -ENOSYS;
-
-	mutex_lock(&inode->i_mutex);
-	inode_dio_wait(inode);
-	unmap_mapping_range(mapping, holebegin, holelen, 1);
-	inode->i_op->truncate_range(inode, lstart, lend);
-	/* unmap again to remove racily COWed private pages */
-	unmap_mapping_range(mapping, holebegin, holelen, 1);
-	mutex_unlock(&inode->i_mutex);
-
-	return 0;
-}
-
 /**
  * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
  * @inode: inode
-- 
cgit v0.10.2


From e2d12e22c59ce714008aa5266d769f8568d74eac Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:41 -0700
Subject: tmpfs: support fallocate preallocation

The systemd plumbers expressed a wish that tmpfs support preallocation.
Cong Wang wrote a patch, but several kernel guys expressed scepticism:
https://lkml.org/lkml/2011/11/18/137

Christoph Hellwig: What for exactly? Please explain why preallocating on
tmpfs would make any sense.

Kay Sievers: To be able to safely use mmap(), regarding SIGBUS, on files
on the /dev/shm filesystem.  The glibc fallback loop for -ENOSYS [or
-EOPNOTSUPP] on fallocate is just ugly.

Hugh Dickins: If tmpfs is going to support
fallocate(FALLOC_FL_PUNCH_HOLE), it would seem perverse to permit the
deallocation but fail the allocation.  Christoph Hellwig: Agreed.

Now that we do have shmem_fallocate() for hole-punching, plumb in basic
support for preallocation mode too.  It's fairly straightforward (though
quite a few details needed attention), except for when it fails part way
through.  What a pity that fallocate(2) was not specified to return the
length allocated, permitting short fallocations!

As it is, when it fails part way through, we ought to free what has just
been allocated by this system call; but must be very sure not to free any
allocated earlier, or any allocated by racing accesses (not all excluded
by i_mutex).

But we cannot distinguish them: so in this patch simply leak allocations
on partial failure (they will be freed later if the file is removed).

An attractive alternative approach would have been for fallocate() not to
allocate pages at all, but note reservations by entries in the radix-tree.
 But that would give less assurance, and, critically, would be hard to fit
with mem cgroups (who owns the reservations?): allocating pages lets
fallocate() behave in just the same way as write().

Based-on-patch-by: Cong Wang <amwang@redhat.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Cong Wang <amwang@redhat.com>
Cc: Kay Sievers <kay@vrfy.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/shmem.c b/mm/shmem.c
index f368d0a..9b90d89 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1602,7 +1602,9 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 							 loff_t len)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
-	int error = -EOPNOTSUPP;
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	pgoff_t start, index, end;
+	int error;
 
 	mutex_lock(&inode->i_mutex);
 
@@ -1617,8 +1619,65 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		shmem_truncate_range(inode, offset, offset + len - 1);
 		/* No need to unmap again: hole-punching leaves COWed pages */
 		error = 0;
+		goto out;
 	}
 
+	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+	error = inode_newsize_ok(inode, offset + len);
+	if (error)
+		goto out;
+
+	start = offset >> PAGE_CACHE_SHIFT;
+	end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	/* Try to avoid a swapstorm if len is impossible to satisfy */
+	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
+		error = -ENOSPC;
+		goto out;
+	}
+
+	for (index = start; index < end; index++) {
+		struct page *page;
+
+		/*
+		 * Good, the fallocate(2) manpage permits EINTR: we may have
+		 * been interrupted because we are using up too much memory.
+		 */
+		if (signal_pending(current))
+			error = -EINTR;
+		else
+			error = shmem_getpage(inode, index, &page, SGP_WRITE,
+									NULL);
+		if (error) {
+			/*
+			 * We really ought to free what we allocated so far,
+			 * but it would be wrong to free pages allocated
+			 * earlier, or already now in use: i_mutex does not
+			 * exclude all cases.  We do not know what to free.
+			 */
+			goto ctime;
+		}
+
+		if (!PageUptodate(page)) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			SetPageUptodate(page);
+		}
+		/*
+		 * set_page_dirty so that memory pressure will swap rather
+		 * than free the pages we are allocating (and SGP_CACHE pages
+		 * might still be clean: we now need to mark those dirty too).
+		 */
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+		cond_resched();
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+		i_size_write(inode, offset + len);
+ctime:
+	inode->i_ctime = CURRENT_TIME;
+out:
 	mutex_unlock(&inode->i_mutex);
 	return error;
 }
-- 
cgit v0.10.2


From 1635f6a74152f1dcd1b888231609d64875f0a81a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:42 -0700
Subject: tmpfs: undo fallocation on failure

In the previous episode, we left the already-fallocated pages attached to
the file when shmem_fallocate() fails part way through.

Now try to do better, by extending the earlier optimization of !Uptodate
pages (then always under page lock) to !Uptodate pages (outside of page
lock), representing fallocated pages.  And don't waste time clearing them
at the time of fallocate(), leave that until later if necessary.

Adapt shmem_truncate_range() to shmem_undo_range(), so that a failing
fallocate can recognize and remove precisely those !Uptodate allocations
which it added (and were not independently allocated by racing tasks).

But unless we start playing with swapfile.c and memcontrol.c too, once one
of our fallocated pages reaches shmem_writepage(), we do then have to
instantiate it as an ordinarily allocated page, before swapping out.  This
is unsatisfactory, but improved in the next episode.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Cong Wang <amwang@redhat.com>
Cc: Kay Sievers <kay@vrfy.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/shmem.c b/mm/shmem.c
index 9b90d89..793dcd1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -89,7 +89,8 @@ enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
 	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
-	SGP_WRITE,	/* may exceed i_size, may allocate page */
+	SGP_WRITE,	/* may exceed i_size, may allocate !Uptodate page */
+	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
 };
 
 #ifdef CONFIG_TMPFS
@@ -427,8 +428,10 @@ void shmem_unlock_mapping(struct address_space *mapping)
 
 /*
  * Remove range of pages and swap entries from radix tree, and free them.
+ * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
  */
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+								 bool unfalloc)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -462,6 +465,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
+				if (unfalloc)
+					continue;
 				nr_swaps_freed += !shmem_free_swap(mapping,
 								index, page);
 				continue;
@@ -469,9 +474,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 
 			if (!trylock_page(page))
 				continue;
-			if (page->mapping == mapping) {
-				VM_BUG_ON(PageWriteback(page));
-				truncate_inode_page(mapping, page);
+			if (!unfalloc || !PageUptodate(page)) {
+				if (page->mapping == mapping) {
+					VM_BUG_ON(PageWriteback(page));
+					truncate_inode_page(mapping, page);
+				}
 			}
 			unlock_page(page);
 		}
@@ -517,12 +524,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 							pvec.pages, indices);
 		if (!pvec.nr) {
-			if (index == start)
+			if (index == start || unfalloc)
 				break;
 			index = start;
 			continue;
 		}
-		if (index == start && indices[0] >= end) {
+		if ((index == start || unfalloc) && indices[0] >= end) {
 			shmem_deswap_pagevec(&pvec);
 			pagevec_release(&pvec);
 			break;
@@ -536,15 +543,19 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
+				if (unfalloc)
+					continue;
 				nr_swaps_freed += !shmem_free_swap(mapping,
 								index, page);
 				continue;
 			}
 
 			lock_page(page);
-			if (page->mapping == mapping) {
-				VM_BUG_ON(PageWriteback(page));
-				truncate_inode_page(mapping, page);
+			if (!unfalloc || !PageUptodate(page)) {
+				if (page->mapping == mapping) {
+					VM_BUG_ON(PageWriteback(page));
+					truncate_inode_page(mapping, page);
+				}
 			}
 			unlock_page(page);
 		}
@@ -558,7 +569,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 	info->swapped -= nr_swaps_freed;
 	shmem_recalc_inode(inode);
 	spin_unlock(&info->lock);
+}
 
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+	shmem_undo_range(inode, lstart, lend, false);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -771,6 +786,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
 		goto redirty;
 	}
+
+	/*
+	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
+	 * value into swapfile.c, the only way we can correctly account for a
+	 * fallocated page arriving here is now to initialize it and write it.
+	 */
+	if (!PageUptodate(page)) {
+		clear_highpage(page);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+	}
+
 	swap = get_swap_page();
 	if (!swap.val)
 		goto redirty;
@@ -994,6 +1021,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	swp_entry_t swap;
 	int error;
 	int once = 0;
+	int alloced = 0;
 
 	if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
 		return -EFBIG;
@@ -1005,19 +1033,21 @@ repeat:
 		page = NULL;
 	}
 
-	if (sgp != SGP_WRITE &&
+	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
 	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 		error = -EINVAL;
 		goto failed;
 	}
 
+	/* fallocated page? */
+	if (page && !PageUptodate(page)) {
+		if (sgp != SGP_READ)
+			goto clear;
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+	}
 	if (page || (sgp == SGP_READ && !swap.val)) {
-		/*
-		 * Once we can get the page lock, it must be uptodate:
-		 * if there were an error in reading back from swap,
-		 * the page would not be inserted into the filecache.
-		 */
-		BUG_ON(page && !PageUptodate(page));
 		*pagep = page;
 		return 0;
 	}
@@ -1114,9 +1144,18 @@ repeat:
 		inode->i_blocks += BLOCKS_PER_PAGE;
 		shmem_recalc_inode(inode);
 		spin_unlock(&info->lock);
+		alloced = true;
 
 		/*
-		 * Let SGP_WRITE caller clear ends if write does not fill page
+		 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+		 */
+		if (sgp == SGP_FALLOC)
+			sgp = SGP_WRITE;
+clear:
+		/*
+		 * Let SGP_WRITE caller clear ends if write does not fill page;
+		 * but SGP_FALLOC on a page fallocated earlier must initialize
+		 * it now, lest undo on failure cancel our earlier guarantee.
 		 */
 		if (sgp != SGP_WRITE) {
 			clear_highpage(page);
@@ -1128,10 +1167,13 @@ repeat:
 	}
 
 	/* Perhaps the file has been truncated since we checked */
-	if (sgp != SGP_WRITE &&
+	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
 	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 		error = -EINVAL;
-		goto trunc;
+		if (alloced)
+			goto trunc;
+		else
+			goto failed;
 	}
 	*pagep = page;
 	return 0;
@@ -1140,6 +1182,7 @@ repeat:
 	 * Error recovery.
 	 */
 trunc:
+	info = SHMEM_I(inode);
 	ClearPageDirty(page);
 	delete_from_page_cache(page);
 	spin_lock(&info->lock);
@@ -1147,6 +1190,7 @@ trunc:
 	inode->i_blocks -= BLOCKS_PER_PAGE;
 	spin_unlock(&info->lock);
 decused:
+	sbinfo = SHMEM_SB(inode->i_sb);
 	if (sbinfo->max_blocks)
 		percpu_counter_add(&sbinfo->used_blocks, -1);
 unacct:
@@ -1645,25 +1689,20 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		if (signal_pending(current))
 			error = -EINTR;
 		else
-			error = shmem_getpage(inode, index, &page, SGP_WRITE,
+			error = shmem_getpage(inode, index, &page, SGP_FALLOC,
 									NULL);
 		if (error) {
-			/*
-			 * We really ought to free what we allocated so far,
-			 * but it would be wrong to free pages allocated
-			 * earlier, or already now in use: i_mutex does not
-			 * exclude all cases.  We do not know what to free.
-			 */
+			/* Remove the !PageUptodate pages we added */
+			shmem_undo_range(inode,
+				(loff_t)start << PAGE_CACHE_SHIFT,
+				(loff_t)index << PAGE_CACHE_SHIFT, true);
 			goto ctime;
 		}
 
-		if (!PageUptodate(page)) {
-			clear_highpage(page);
-			flush_dcache_page(page);
-			SetPageUptodate(page);
-		}
 		/*
-		 * set_page_dirty so that memory pressure will swap rather
+		 * If !PageUptodate, leave it that way so that freeable pages
+		 * can be recognized if we need to rollback on error later.
+		 * But set_page_dirty so that memory pressure will swap rather
 		 * than free the pages we are allocating (and SGP_CACHE pages
 		 * might still be clean: we now need to mark those dirty too).
 		 */
-- 
cgit v0.10.2


From 1aac1400319d30786f32b9290e9cc923937b3d57 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:42 -0700
Subject: tmpfs: quit when fallocate fills memory

As it stands, a large fallocate() on tmpfs is liable to fill memory with
pages, freed on failure except when they run into swap, at which point
they become fixed into the file despite the failure.  That feels quite
wrong, to be consuming resources precisely when they're in short supply.

Go the other way instead: shmem_fallocate() indicate the range it has
fallocated to shmem_writepage(), keeping count of pages it's allocating;
shmem_writepage() reactivate instead of swapping out pages fallocated by
this syscall (but happily swap out those from earlier occasions), keeping
count; shmem_fallocate() compare counts and give up once the reactivated
pages have started to coming back to writepage (approximately: some zones
would in fact recycle faster than others).

This is a little unusual, but works well: although we could consider the
failure to swap as a bug, and fix it later with SWAP_MAP_FALLOC handling
added in swapfile.c and memcontrol.c, I doubt that we shall ever want to.

(If there's no swap, an over-large fallocate() on tmpfs is limited in the
same way as writing: stopped by rlimit, or by tmpfs mount size if that was
set sensibly, or by __vm_enough_memory() heuristics if OVERCOMMIT_GUESS or
OVERCOMMIT_NEVER.  If OVERCOMMIT_ALWAYS, then it is liable to OOM-kill
others as writing would, but stops and frees if interrupted.)

Now that everything is freed on failure, we can then skip updating ctime.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Cong Wang <amwang@redhat.com>
Cc: Kay Sievers <kay@vrfy.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/shmem.c b/mm/shmem.c
index 793dcd1..191663a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -84,6 +84,18 @@ struct shmem_xattr {
 	char value[0];
 };
 
+/*
+ * shmem_fallocate and shmem_writepage communicate via inode->i_private
+ * (with i_mutex making sure that it has only one user at a time):
+ * we would prefer not to enlarge the shmem inode just for that.
+ */
+struct shmem_falloc {
+	pgoff_t start;		/* start of range currently being fallocated */
+	pgoff_t next;		/* the next page offset to be fallocated */
+	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
+	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
+};
+
 /* Flag allocation requirements to shmem_getpage */
 enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
@@ -791,8 +803,28 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
 	 * value into swapfile.c, the only way we can correctly account for a
 	 * fallocated page arriving here is now to initialize it and write it.
+	 *
+	 * That's okay for a page already fallocated earlier, but if we have
+	 * not yet completed the fallocation, then (a) we want to keep track
+	 * of this page in case we have to undo it, and (b) it may not be a
+	 * good idea to continue anyway, once we're pushing into swap.  So
+	 * reactivate the page, and let shmem_fallocate() quit when too many.
 	 */
 	if (!PageUptodate(page)) {
+		if (inode->i_private) {
+			struct shmem_falloc *shmem_falloc;
+			spin_lock(&inode->i_lock);
+			shmem_falloc = inode->i_private;
+			if (shmem_falloc &&
+			    index >= shmem_falloc->start &&
+			    index < shmem_falloc->next)
+				shmem_falloc->nr_unswapped++;
+			else
+				shmem_falloc = NULL;
+			spin_unlock(&inode->i_lock);
+			if (shmem_falloc)
+				goto redirty;
+		}
 		clear_highpage(page);
 		flush_dcache_page(page);
 		SetPageUptodate(page);
@@ -1647,6 +1679,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct shmem_falloc shmem_falloc;
 	pgoff_t start, index, end;
 	int error;
 
@@ -1679,6 +1712,14 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		goto out;
 	}
 
+	shmem_falloc.start = start;
+	shmem_falloc.next  = start;
+	shmem_falloc.nr_falloced = 0;
+	shmem_falloc.nr_unswapped = 0;
+	spin_lock(&inode->i_lock);
+	inode->i_private = &shmem_falloc;
+	spin_unlock(&inode->i_lock);
+
 	for (index = start; index < end; index++) {
 		struct page *page;
 
@@ -1688,6 +1729,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		 */
 		if (signal_pending(current))
 			error = -EINTR;
+		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
+			error = -ENOMEM;
 		else
 			error = shmem_getpage(inode, index, &page, SGP_FALLOC,
 									NULL);
@@ -1696,10 +1739,18 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 			shmem_undo_range(inode,
 				(loff_t)start << PAGE_CACHE_SHIFT,
 				(loff_t)index << PAGE_CACHE_SHIFT, true);
-			goto ctime;
+			goto undone;
 		}
 
 		/*
+		 * Inform shmem_writepage() how far we have reached.
+		 * No need for lock or barrier: we have the page lock.
+		 */
+		shmem_falloc.next++;
+		if (!PageUptodate(page))
+			shmem_falloc.nr_falloced++;
+
+		/*
 		 * If !PageUptodate, leave it that way so that freeable pages
 		 * can be recognized if we need to rollback on error later.
 		 * But set_page_dirty so that memory pressure will swap rather
@@ -1714,8 +1765,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 
 	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
 		i_size_write(inode, offset + len);
-ctime:
 	inode->i_ctime = CURRENT_TIME;
+undone:
+	spin_lock(&inode->i_lock);
+	inode->i_private = NULL;
+	spin_unlock(&inode->i_lock);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return error;
-- 
cgit v0.10.2


From 4fb5ef089b288942c6fc3f85c4ecb4016c1aa4c3 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:43 -0700
Subject: tmpfs: support SEEK_DATA and SEEK_HOLE

It's quite easy for tmpfs to scan the radix_tree to support llseek's new
SEEK_DATA and SEEK_HOLE options: so add them while the minutiae are still
on my mind (in particular, the !PageUptodate-ness of pages fallocated but
still unwritten).

But I don't know who actually uses SEEK_DATA or SEEK_HOLE, and whether it
would be of any use to them on tmpfs.  This code adds 92 lines and 752
bytes on x86_64 - is that bloat or worthwhile?

[akpm@linux-foundation.org: fix warning with CONFIG_TMPFS=n]
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Josef Bacik <josef@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Andreas Dilger <adilger@dilger.ca>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Marco Stornelli <marco.stornelli@gmail.com>
Cc: Jeff liu <jeff.liu@oracle.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/shmem.c b/mm/shmem.c
index 191663a..d576b84 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1674,6 +1674,98 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	return error;
 }
 
+/*
+ * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
+ */
+static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
+				    pgoff_t index, pgoff_t end, int origin)
+{
+	struct page *page;
+	struct pagevec pvec;
+	pgoff_t indices[PAGEVEC_SIZE];
+	bool done = false;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	pvec.nr = 1;		/* start small: we may be there already */
+	while (!done) {
+		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+					pvec.nr, pvec.pages, indices);
+		if (!pvec.nr) {
+			if (origin == SEEK_DATA)
+				index = end;
+			break;
+		}
+		for (i = 0; i < pvec.nr; i++, index++) {
+			if (index < indices[i]) {
+				if (origin == SEEK_HOLE) {
+					done = true;
+					break;
+				}
+				index = indices[i];
+			}
+			page = pvec.pages[i];
+			if (page && !radix_tree_exceptional_entry(page)) {
+				if (!PageUptodate(page))
+					page = NULL;
+			}
+			if (index >= end ||
+			    (page && origin == SEEK_DATA) ||
+			    (!page && origin == SEEK_HOLE)) {
+				done = true;
+				break;
+			}
+		}
+		shmem_deswap_pagevec(&pvec);
+		pagevec_release(&pvec);
+		pvec.nr = PAGEVEC_SIZE;
+		cond_resched();
+	}
+	return index;
+}
+
+static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct address_space *mapping;
+	struct inode *inode;
+	pgoff_t start, end;
+	loff_t new_offset;
+
+	if (origin != SEEK_DATA && origin != SEEK_HOLE)
+		return generic_file_llseek_size(file, offset, origin,
+							MAX_LFS_FILESIZE);
+	mapping = file->f_mapping;
+	inode = mapping->host;
+	mutex_lock(&inode->i_mutex);
+	/* We're holding i_mutex so we can access i_size directly */
+
+	if (offset < 0)
+		offset = -EINVAL;
+	else if (offset >= inode->i_size)
+		offset = -ENXIO;
+	else {
+		start = offset >> PAGE_CACHE_SHIFT;
+		end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		new_offset = shmem_seek_hole_data(mapping, start, end, origin);
+		new_offset <<= PAGE_CACHE_SHIFT;
+		if (new_offset > offset) {
+			if (new_offset < inode->i_size)
+				offset = new_offset;
+			else if (origin == SEEK_DATA)
+				offset = -ENXIO;
+			else
+				offset = inode->i_size;
+		}
+	}
+
+	if (offset >= 0 && offset != file->f_pos) {
+		file->f_pos = offset;
+		file->f_version = 0;
+	}
+	mutex_unlock(&inode->i_mutex);
+	return offset;
+}
+
 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 							 loff_t len)
 {
@@ -2679,7 +2771,7 @@ static const struct address_space_operations shmem_aops = {
 static const struct file_operations shmem_file_operations = {
 	.mmap		= shmem_mmap,
 #ifdef CONFIG_TMPFS
-	.llseek		= generic_file_llseek,
+	.llseek		= shmem_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= shmem_file_aio_read,
-- 
cgit v0.10.2


From 782182e53a6cdb3e3d04cc40516e173046942a32 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 29 May 2012 15:06:43 -0700
Subject: mm: move readahead syscall to mm/readahead.c

It is better to define readahead(2) in mm/readahead.c than in
mm/filemap.c.

Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/filemap.c b/mm/filemap.c
index 79c4b2b..64b48f9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,7 +29,6 @@
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
 #include <linux/security.h>
-#include <linux/syscalls.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
@@ -1478,44 +1477,6 @@ out:
 }
 EXPORT_SYMBOL(generic_file_aio_read);
 
-static ssize_t
-do_readahead(struct address_space *mapping, struct file *filp,
-	     pgoff_t index, unsigned long nr)
-{
-	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
-		return -EINVAL;
-
-	force_page_cache_readahead(mapping, filp, index, nr);
-	return 0;
-}
-
-SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
-{
-	ssize_t ret;
-	struct file *file;
-
-	ret = -EBADF;
-	file = fget(fd);
-	if (file) {
-		if (file->f_mode & FMODE_READ) {
-			struct address_space *mapping = file->f_mapping;
-			pgoff_t start = offset >> PAGE_CACHE_SHIFT;
-			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
-			unsigned long len = end - start + 1;
-			ret = do_readahead(mapping, file, start, len);
-		}
-		fput(file);
-	}
-	return ret;
-}
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
-{
-	return SYSC_readahead((int) fd, offset, (size_t) count);
-}
-SYSCALL_ALIAS(sys_readahead, SyS_readahead);
-#endif
-
 #ifdef CONFIG_MMU
 /**
  * page_cache_read - adds requested page to the page cache if not already there
diff --git a/mm/readahead.c b/mm/readahead.c
index cbcbb02..ea8f8fa 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,8 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/pagevec.h>
 #include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/file.h>
 
 /*
  * Initialise a struct file's readahead state.  Assumes that the caller has
@@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping,
 	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
 }
 EXPORT_SYMBOL_GPL(page_cache_async_readahead);
+
+static ssize_t
+do_readahead(struct address_space *mapping, struct file *filp,
+	     pgoff_t index, unsigned long nr)
+{
+	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
+		return -EINVAL;
+
+	force_page_cache_readahead(mapping, filp, index, nr);
+	return 0;
+}
+
+SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
+{
+	ssize_t ret;
+	struct file *file;
+
+	ret = -EBADF;
+	file = fget(fd);
+	if (file) {
+		if (file->f_mode & FMODE_READ) {
+			struct address_space *mapping = file->f_mapping;
+			pgoff_t start = offset >> PAGE_CACHE_SHIFT;
+			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+			unsigned long len = end - start + 1;
+			ret = do_readahead(mapping, file, start, len);
+		}
+		fput(file);
+	}
+	return ret;
+}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
+{
+	return SYSC_readahead((int) fd, offset, (size_t) count);
+}
+SYSCALL_ALIAS(sys_readahead, SyS_readahead);
+#endif
-- 
cgit v0.10.2


From be9cd873e2a706a688e37224d48e135efd8c0d26 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Tue, 29 May 2012 15:06:44 -0700
Subject: mm/buddy: dump PG_compound_lock page flag

The array pageflag_names[] does conversion from page flags into their
corresponding names so that a meaningful representation of the
corresponding page flag can be printed.  This mechanism is used while
dumping page frames.  However, the array missed PG_compound_lock.  So
the PG_compound_lock page flag would be printed as a digital number
instead of a meaningful string.

The patch fixes that and prints "compound_lock" for the PG_compound_lock
page flag.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 457b4de..72869ea 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5973,6 +5973,9 @@ static struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_MEMORY_FAILURE
 	{1UL << PG_hwpoison,		"hwpoison"	},
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	{1UL << PG_compound_lock,	"compound_lock"	},
+#endif
 	{-1UL,				NULL		},
 };
 
-- 
cgit v0.10.2


From acc50c110cf6f1a8bd1af410b2c7bc29ca624678 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 29 May 2012 15:06:44 -0700
Subject: mm: page_alloc: catch out-of-date list of page flag names

String tables with names of enum items are always prone to go out of
sync with the enums themselves.  Ensure during compile time that the
name table of page flags has the same size as the page flags enum.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 72869ea..5712898 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5985,6 +5985,8 @@ static void dump_page_flags(unsigned long flags)
 	unsigned long mask;
 	int i;
 
+	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) - 1 != __NR_PAGEFLAGS);
+
 	printk(KERN_ALERT "page flags: %#lx(", flags);
 
 	/* remove zone id */
-- 
cgit v0.10.2


From 51300cef41c9355a7d428fe0714888d3c72a6f2f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 29 May 2012 15:06:44 -0700
Subject: mm/page_alloc.c: cleanups

- make pageflag_names[] const

- remove null termination of pageflag_names[]

Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5712898..4fc462b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5938,7 +5938,7 @@ bool is_free_buddy_page(struct page *page)
 }
 #endif
 
-static struct trace_print_flags pageflag_names[] = {
+static const struct trace_print_flags pageflag_names[] = {
 	{1UL << PG_locked,		"locked"	},
 	{1UL << PG_error,		"error"		},
 	{1UL << PG_referenced,		"referenced"	},
@@ -5976,7 +5976,6 @@ static struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	{1UL << PG_compound_lock,	"compound_lock"	},
 #endif
-	{-1UL,				NULL		},
 };
 
 static void dump_page_flags(unsigned long flags)
@@ -5985,14 +5984,14 @@ static void dump_page_flags(unsigned long flags)
 	unsigned long mask;
 	int i;
 
-	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) - 1 != __NR_PAGEFLAGS);
+	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
 
 	printk(KERN_ALERT "page flags: %#lx(", flags);
 
 	/* remove zone id */
 	flags &= (1UL << NR_PAGEFLAGS) - 1;
 
-	for (i = 0; pageflag_names[i].name && flags; i++) {
+	for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
 
 		mask = pageflag_names[i].mask;
 		if ((flags & mask) != mask)
-- 
cgit v0.10.2


From 692569946fbf56fbb75d85c57679541f9a3550b4 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 29 May 2012 15:06:45 -0700
Subject: mm: document the meminfo and vmstat fields of relevance to
 transparent hugepages

Update Documentation/vm/transhuge.txt and
Documentation/filesystems/proc.txt with some information on monitoring
transparent huge page usage and the associated overhead.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index ef088e5..912af6c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -743,6 +743,7 @@ Committed_AS:   100056 kB
 VmallocTotal:   112216 kB
 VmallocUsed:       428 kB
 VmallocChunk:   111088 kB
+AnonHugePages:   49152 kB
 
     MemTotal: Total usable ram (i.e. physical ram minus a few reserved
               bits and the kernel binary code)
@@ -776,6 +777,7 @@ VmallocChunk:   111088 kB
        Dirty: Memory which is waiting to get written back to the disk
    Writeback: Memory which is actively being written back to the disk
    AnonPages: Non-file backed pages mapped into userspace page tables
+AnonHugePages: Non-file backed huge pages mapped into userspace page tables
       Mapped: files which have been mmaped, such as libraries
         Slab: in-kernel data structures cache
 SReclaimable: Part of Slab, that might be reclaimed, such as caches
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 29bdf62..f734bb2 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -166,6 +166,68 @@ behavior. So to make them effective you need to restart any
 application that could have been using hugepages. This also applies to
 the regions registered in khugepaged.
 
+== Monitoring usage ==
+
+The number of transparent huge pages currently used by the system is
+available by reading the AnonHugePages field in /proc/meminfo. To
+identify what applications are using transparent huge pages, it is
+necessary to read /proc/PID/smaps and count the AnonHugePages fields
+for each mapping. Note that reading the smaps file is expensive and
+reading it frequently will incur overhead.
+
+There are a number of counters in /proc/vmstat that may be used to
+monitor how successfully the system is providing huge pages for use.
+
+thp_fault_alloc is incremented every time a huge page is successfully
+	allocated to handle a page fault. This applies to both the
+	first time a page is faulted and for COW faults.
+
+thp_collapse_alloc is incremented by khugepaged when it has found
+	a range of pages to collapse into one huge page and has
+	successfully allocated a new huge page to store the data.
+
+thp_fault_fallback is incremented if a page fault fails to allocate
+	a huge page and instead falls back to using small pages.
+
+thp_collapse_alloc_failed is incremented if khugepaged found a range
+	of pages that should be collapsed into one huge page but failed
+	the allocation.
+
+thp_split is incremented every time a huge page is split into base
+	pages. This can happen for a variety of reasons but a common
+	reason is that a huge page is old and is being reclaimed.
+
+As the system ages, allocating huge pages may be expensive as the
+system uses memory compaction to copy data around memory to free a
+huge page for use. There are some counters in /proc/vmstat to help
+monitor this overhead.
+
+compact_stall is incremented every time a process stalls to run
+	memory compaction so that a huge page is free for use.
+
+compact_success is incremented if the system compacted memory and
+	freed a huge page for use.
+
+compact_fail is incremented if the system tries to compact memory
+	but failed.
+
+compact_pages_moved is incremented each time a page is moved. If
+	this value is increasing rapidly, it implies that the system
+	is copying a lot of data to satisfy the huge page allocation.
+	It is possible that the cost of copying exceeds any savings
+	from reduced TLB misses.
+
+compact_pagemigrate_failed is incremented when the underlying mechanism
+	for moving a page failed.
+
+compact_blocks_moved is incremented each time memory compaction examines
+	a huge page aligned range of pages.
+
+It is possible to establish how long the stalls were using the function
+tracer to record how long was spent in __alloc_pages_nodemask and
+using the mm_page_alloc tracepoint to identify which allocations were
+for huge pages.
+
 == get_user_pages and follow_page ==
 
 get_user_pages and follow_page if run on a hugepage, will return the
-- 
cgit v0.10.2


From e48982734ea0500d1eba4f9d96195acc5406cad6 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Tue, 29 May 2012 15:06:45 -0700
Subject: mm: consider all swapped back pages in used-once logic

Commit 645747462435 ("vmscan: detect mapped file pages used only once")
made mapped pages have another round in inactive list because they might
be just short lived and so we could consider them again next time.  This
heuristic helps to reduce pressure on the active list with a streaming
IO worklods.

This patch fixes a regression introduced by this commit for heavy shmem
based workloads because unlike Anon pages, which are excluded from this
heuristic because they are usually long lived, shmem pages are handled
as a regular page cache.

This doesn't work quite well, unfortunately, if the workload is mostly
backed by shmem (in memory database sitting on 80% of memory) with a
streaming IO in the background (backup - up to 20% of memory).  Anon
inactive list is full of (dirty) shmem pages when watermarks are hit.
Shmem pages are kept in the inactive list (they are referenced) in the
first round and it is hard to reclaim anything else so we reach lower
scanning priorities very quickly which leads to an excessive swap out.

Let's fix this by excluding all swap backed pages (they tend to be long
lived wrt.  the regular page cache anyway) from used-once heuristic and
rather activate them if they are referenced.

The customer's workload is shmem backed database (80% of RAM) and they
are measuring transactions/s with an IO in the background (20%).
Transactions touch more or less random rows in the table.  The
transaction rate fell by a factor of 3 (in the worst case) because of
commit 64574746.  This patch restores the previous numbers.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Minchan Kim <minchan@kernel.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org>	[2.6.34+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 44f0436..67a4fd4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -657,7 +657,7 @@ static enum page_references page_check_references(struct page *page,
 		return PAGEREF_RECLAIM;
 
 	if (referenced_ptes) {
-		if (PageAnon(page))
+		if (PageSwapBacked(page))
 			return PAGEREF_ACTIVATE;
 		/*
 		 * All mapped pages start out with page table
-- 
cgit v0.10.2


From 5c2b8a162b5f8616f709bf20d5ec88f709485522 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Tue, 29 May 2012 15:06:46 -0700
Subject: mm/bootmem.c: cleanup on addition to bootmem data list

The objects of "struct bootmem_data_t" are linked together to form
double-linked list sequentially based on its minimal page frame number.

The current implementation implicitly supports the following cases,
which means the inserting point for current bootmem data depends on how
"list_for_each" works.  That makes the code a little hard to read.
Besides, "list_for_each" and "list_entry" can be replaced with
"list_for_each_entry".

        - The linked list is empty.
        - There has no entry in the linked list, whose minimal page
          frame number is bigger than current one.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/bootmem.c b/mm/bootmem.c
index d1c7a79..ec4fcb7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages)
  */
 static void __init link_bootmem(bootmem_data_t *bdata)
 {
-	struct list_head *iter;
+	bootmem_data_t *ent;
 
-	list_for_each(iter, &bdata_list) {
-		bootmem_data_t *ent;
-
-		ent = list_entry(iter, bootmem_data_t, list);
-		if (bdata->node_min_pfn < ent->node_min_pfn)
-			break;
+	list_for_each_entry(ent, &bdata_list, list) {
+		if (bdata->node_min_pfn < ent->node_min_pfn) {
+			list_add_tail(&bdata->list, &ent->list);
+			return;
+		}
 	}
-	list_add_tail(&bdata->list, iter);
+
+	list_add_tail(&bdata->list, &bdata_list);
 }
 
 /*
-- 
cgit v0.10.2


From c50ac050811d6485616a193eb0f37bfbd191cc89 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Tue, 29 May 2012 15:06:46 -0700
Subject: hugetlb: fix resv_map leak in error path

When called for anonymous (non-shared) mappings, hugetlb_reserve_pages()
does a resv_map_alloc().  It depends on code in hugetlbfs's
vm_ops->close() to release that allocation.

However, in the mmap() failure path, we do a plain unmap_region() without
the remove_vma() which actually calls vm_ops->close().

This is a decent fix.  This leak could get reintroduced if new code (say,
after hugetlb_reserve_pages() in hugetlbfs_file_mmap()) decides to return
an error.  But, I think it would have to unroll the reservation anyway.

Christoph's test case:

	http://marc.info/?l=linux-mm&m=133728900729735

This patch applies to 3.4 and later.  A version for earlier kernels is at
https://lkml.org/lkml/2012/5/22/418.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reported-by: Christoph Lameter <cl@linux.com>
Tested-by: Christoph Lameter <cl@linux.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>	[2.6.32+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 41a647d..285a81e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2157,6 +2157,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
 		kref_get(&reservations->refs);
 }
 
+static void resv_map_put(struct vm_area_struct *vma)
+{
+	struct resv_map *reservations = vma_resv_map(vma);
+
+	if (!reservations)
+		return;
+	kref_put(&reservations->refs, resv_map_release);
+}
+
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
 	struct hstate *h = hstate_vma(vma);
@@ -2173,7 +2182,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 		reserve = (end - start) -
 			region_count(&reservations->regions, start, end);
 
-		kref_put(&reservations->refs, resv_map_release);
+		resv_map_put(vma);
 
 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
@@ -2991,12 +3000,16 @@ int hugetlb_reserve_pages(struct inode *inode,
 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
 
-	if (chg < 0)
-		return chg;
+	if (chg < 0) {
+		ret = chg;
+		goto out_err;
+	}
 
 	/* There must be enough pages in the subpool for the mapping */
-	if (hugepage_subpool_get_pages(spool, chg))
-		return -ENOSPC;
+	if (hugepage_subpool_get_pages(spool, chg)) {
+		ret = -ENOSPC;
+		goto out_err;
+	}
 
 	/*
 	 * Check enough hugepages are available for the reservation.
@@ -3005,7 +3018,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
 		hugepage_subpool_put_pages(spool, chg);
-		return ret;
+		goto out_err;
 	}
 
 	/*
@@ -3022,6 +3035,9 @@ int hugetlb_reserve_pages(struct inode *inode,
 	if (!vma || vma->vm_flags & VM_MAYSHARE)
 		region_add(&inode->i_mapping->private_list, from, to);
 	return 0;
+out_err:
+	resv_map_put(vma);
+	return ret;
 }
 
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
-- 
cgit v0.10.2


From fe35004fbf9eaf67482b074a2e032abb9c89b1dd Mon Sep 17 00:00:00 2001
From: Satoru Moriya <satoru.moriya@hds.com>
Date: Tue, 29 May 2012 15:06:47 -0700
Subject: mm: avoid swapping out with swappiness==0

Sometimes we'd like to avoid swapping out anonymous memory.  In
particular, avoid swapping out pages of important process or process
groups while there is a reasonable amount of pagecache on RAM so that we
can satisfy our customers' requirements.

OTOH, we can control how aggressive the kernel will swap memory pages with
/proc/sys/vm/swappiness for global and
/sys/fs/cgroup/memory/memory.swappiness for each memcg.

But with current reclaim implementation, the kernel may swap out even if
we set swappiness=0 and there is pagecache in RAM.

This patch changes the behavior with swappiness==0.  If we set
swappiness==0, the kernel does not swap out completely (for global reclaim
until the amount of free pages and filebacked pages in a zone has been
reduced to something very very small (nr_free + nr_filebacked < high
watermark)).

Signed-off-by: Satoru Moriya <satoru.moriya@hds.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 67a4fd4..ee97530 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1761,10 +1761,10 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
 	 * proportional to the fraction of recently scanned pages on
 	 * each list that were recently referenced and in active use.
 	 */
-	ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
+	ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
 	ap /= reclaim_stat->recent_rotated[0] + 1;
 
-	fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
+	fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
 	fp /= reclaim_stat->recent_rotated[1] + 1;
 	spin_unlock_irq(&mz->zone->lru_lock);
 
@@ -1777,7 +1777,7 @@ out:
 		unsigned long scan;
 
 		scan = zone_nr_lru_pages(mz, lru);
-		if (priority || noswap) {
+		if (priority || noswap || !vmscan_swappiness(mz, sc)) {
 			scan >>= priority;
 			if (!scan && force_scan)
 				scan = SWAP_CLUSTER_MAX;
-- 
cgit v0.10.2


From a7f638f999ff42310e9582273b1fe25ea6e469ba Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 29 May 2012 15:06:47 -0700
Subject: mm, oom: normalize oom scores to oom_score_adj scale only for
 userspace

The oom_score_adj scale ranges from -1000 to 1000 and represents the
proportion of memory available to the process at allocation time.  This
means an oom_score_adj value of 300, for example, will bias a process as
though it was using an extra 30.0% of available memory and a value of
-350 will discount 35.0% of available memory from its usage.

The oom killer badness heuristic also uses this scale to report the oom
score for each eligible process in determining the "best" process to
kill.  Thus, it can only differentiate each process's memory usage by
0.1% of system RAM.

On large systems, this can end up being a large amount of memory: 256MB
on 256GB systems, for example.

This can be fixed by having the badness heuristic to use the actual
memory usage in scoring threads and then normalizing it to the
oom_score_adj scale for userspace.  This results in better comparison
between eligible threads for kill and no change from the userspace
perspective.

Suggested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: Dave Jones <davej@redhat.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/proc/base.c b/fs/proc/base.c
index d2d3108..d7d7118 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -411,12 +411,13 @@ static const struct file_operations proc_lstats_operations = {
 
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
+	unsigned long totalpages = totalram_pages + total_swap_pages;
 	unsigned long points = 0;
 
 	read_lock(&tasklist_lock);
 	if (pid_alive(task))
-		points = oom_badness(task, NULL, NULL,
-					totalram_pages + total_swap_pages);
+		points = oom_badness(task, NULL, NULL, totalpages) *
+						1000 / totalpages;
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 3d76475..e4c29bc 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -43,8 +43,9 @@ enum oom_constraint {
 extern void compare_swap_oom_score_adj(int old_val, int new_val);
 extern int test_set_oom_score_adj(int new_val);
 
-extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-			const nodemask_t *nodemask, unsigned long totalpages);
+extern unsigned long oom_badness(struct task_struct *p,
+		struct mem_cgroup *memcg, const nodemask_t *nodemask,
+		unsigned long totalpages);
 extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9f09a1f..ed0e196 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -180,10 +180,10 @@ static bool oom_unkillable_task(struct task_struct *p,
  * predictable as possible.  The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-		      const nodemask_t *nodemask, unsigned long totalpages)
+unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
+			  const nodemask_t *nodemask, unsigned long totalpages)
 {
-	long points;
+	unsigned long points;
 
 	if (oom_unkillable_task(p, memcg, nodemask))
 		return 0;
@@ -198,21 +198,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	}
 
 	/*
-	 * The memory controller may have a limit of 0 bytes, so avoid a divide
-	 * by zero, if necessary.
-	 */
-	if (!totalpages)
-		totalpages = 1;
-
-	/*
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + p->mm->nr_ptes;
-	points += get_mm_counter(p->mm, MM_SWAPENTS);
-
-	points *= 1000;
-	points /= totalpages;
+	points = get_mm_rss(p->mm) + p->mm->nr_ptes +
+		 get_mm_counter(p->mm, MM_SWAPENTS);
 	task_unlock(p);
 
 	/*
@@ -220,23 +210,20 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * implementation used by LSMs.
 	 */
 	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
-		points -= 30;
+		points -= 30 * totalpages / 1000;
 
 	/*
 	 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
 	 * either completely disable oom killing or always prefer a certain
 	 * task.
 	 */
-	points += p->signal->oom_score_adj;
+	points += p->signal->oom_score_adj * totalpages / 1000;
 
 	/*
-	 * Never return 0 for an eligible task that may be killed since it's
-	 * possible that no single user task uses more than 0.1% of memory and
-	 * no single admin tasks uses more than 3.0%.
+	 * Never return 0 for an eligible task regardless of the root bonus and
+	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
 	 */
-	if (points <= 0)
-		return 1;
-	return (points < 1000) ? points : 1000;
+	return points ? points : 1;
 }
 
 /*
@@ -314,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 {
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
-	*ppoints = 0;
+	unsigned long chosen_points = 0;
 
 	do_each_thread(g, p) {
 		unsigned int points;
@@ -354,7 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 			 */
 			if (p == current) {
 				chosen = p;
-				*ppoints = 1000;
+				chosen_points = ULONG_MAX;
 			} else if (!force_kill) {
 				/*
 				 * If this task is not being ptraced on exit,
@@ -367,12 +354,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 		}
 
 		points = oom_badness(p, memcg, nodemask, totalpages);
-		if (points > *ppoints) {
+		if (points > chosen_points) {
 			chosen = p;
-			*ppoints = points;
+			chosen_points = points;
 		}
 	} while_each_thread(g, p);
 
+	*ppoints = chosen_points * 1000 / totalpages;
 	return chosen;
 }
 
@@ -572,7 +560,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	}
 
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
-	limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
+	limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
 	read_lock(&tasklist_lock);
 	p = select_bad_process(&points, limit, memcg, NULL, false);
 	if (p && PTR_ERR(p) != -1UL)
-- 
cgit v0.10.2


From 26c191788f18129af0eb32a358cdaea0c7479626 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 29 May 2012 15:06:49 -0700
Subject: mm: pmd_read_atomic: fix 32bit PAE pmd walk vs pmd_populate SMP race
 condition

When holding the mmap_sem for reading, pmd_offset_map_lock should only
run on a pmd_t that has been read atomically from the pmdp pointer,
otherwise we may read only half of it leading to this crash.

PID: 11679  TASK: f06e8000  CPU: 3   COMMAND: "do_race_2_panic"
 #0 [f06a9dd8] crash_kexec at c049b5ec
 #1 [f06a9e2c] oops_end at c083d1c2
 #2 [f06a9e40] no_context at c0433ded
 #3 [f06a9e64] bad_area_nosemaphore at c043401a
 #4 [f06a9e6c] __do_page_fault at c0434493
 #5 [f06a9eec] do_page_fault at c083eb45
 #6 [f06a9f04] error_code (via page_fault) at c083c5d5
    EAX: 01fb470c EBX: fff35000 ECX: 00000003 EDX: 00000100 EBP:
    00000000
    DS:  007b     ESI: 9e201000 ES:  007b     EDI: 01fb4700 GS:  00e0
    CS:  0060     EIP: c083bc14 ERR: ffffffff EFLAGS: 00010246
 #7 [f06a9f38] _spin_lock at c083bc14
 #8 [f06a9f44] sys_mincore at c0507b7d
 #9 [f06a9fb0] system_call at c083becd
                         start           len
    EAX: ffffffda  EBX: 9e200000  ECX: 00001000  EDX: 6228537f
    DS:  007b      ESI: 00000000  ES:  007b      EDI: 003d0f00
    SS:  007b      ESP: 62285354  EBP: 62285388  GS:  0033
    CS:  0073      EIP: 00291416  ERR: 000000da  EFLAGS: 00000286

This should be a longstanding bug affecting x86 32bit PAE without THP.
Only archs with 64bit large pmd_t and 32bit unsigned long should be
affected.

With THP enabled the barrier() in pmd_none_or_trans_huge_or_clear_bad()
would partly hide the bug when the pmd transition from none to stable,
by forcing a re-read of the *pmd in pmd_offset_map_lock, but when THP is
enabled a new set of problem arises by the fact could then transition
freely in any of the none, pmd_trans_huge or pmd_trans_stable states.
So making the barrier in pmd_none_or_trans_huge_or_clear_bad()
unconditional isn't good idea and it would be a flakey solution.

This should be fully fixed by introducing a pmd_read_atomic that reads
the pmd in order with THP disabled, or by reading the pmd atomically
with cmpxchg8b with THP enabled.

Luckily this new race condition only triggers in the places that must
already be covered by pmd_none_or_trans_huge_or_clear_bad() so the fix
is localized there but this bug is not related to THP.

NOTE: this can trigger on x86 32bit systems with PAE enabled with more
than 4G of ram, otherwise the high part of the pmd will never risk to be
truncated because it would be zero at all times, in turn so hiding the
SMP race.

This bug was discovered and fully debugged by Ulrich, quote:

----
[..]
pmd_none_or_trans_huge_or_clear_bad() loads the content of edx and
eax.

    496 static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t
    *pmd)
    497 {
    498         /* depend on compiler for an atomic pmd read */
    499         pmd_t pmdval = *pmd;

                                // edi = pmd pointer
0xc0507a74 <sys_mincore+548>:   mov    0x8(%esp),%edi
...
                                // edx = PTE page table high address
0xc0507a84 <sys_mincore+564>:   mov    0x4(%edi),%edx
...
                                // eax = PTE page table low address
0xc0507a8e <sys_mincore+574>:   mov    (%edi),%eax

[..]

Please note that the PMD is not read atomically. These are two "mov"
instructions where the high order bits of the PMD entry are fetched
first. Hence, the above machine code is prone to the following race.

-  The PMD entry {high|low} is 0x0000000000000000.
   The "mov" at 0xc0507a84 loads 0x00000000 into edx.

-  A page fault (on another CPU) sneaks in between the two "mov"
   instructions and instantiates the PMD.

-  The PMD entry {high|low} is now 0x00000003fda38067.
   The "mov" at 0xc0507a8e loads 0xfda38067 into eax.
----

Reported-by: Ulrich Obergfell <uobergfe@redhat.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Petr Matousek <pmatouse@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index effff47..43876f1 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -31,6 +31,56 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
 	ptep->pte_low = pte.pte_low;
 }
 
+#define pmd_read_atomic pmd_read_atomic
+/*
+ * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
+ * a "*pmdp" dereference done by gcc. Problem is, in certain places
+ * where pte_offset_map_lock is called, concurrent page faults are
+ * allowed, if the mmap_sem is hold for reading. An example is mincore
+ * vs page faults vs MADV_DONTNEED. On the page fault side
+ * pmd_populate rightfully does a set_64bit, but if we're reading the
+ * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
+ * because gcc will not read the 64bit of the pmd atomically. To fix
+ * this all places running pmd_offset_map_lock() while holding the
+ * mmap_sem in read mode, shall read the pmdp pointer using this
+ * function to know if the pmd is null nor not, and in turn to know if
+ * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
+ * operations.
+ *
+ * Without THP if the mmap_sem is hold for reading, the
+ * pmd can only transition from null to not null while pmd_read_atomic runs.
+ * So there's no need of literally reading it atomically.
+ *
+ * With THP if the mmap_sem is hold for reading, the pmd can become
+ * THP or null or point to a pte (and in turn become "stable") at any
+ * time under pmd_read_atomic, so it's mandatory to read it atomically
+ * with cmpxchg8b.
+ */
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+	pmdval_t ret;
+	u32 *tmp = (u32 *)pmdp;
+
+	ret = (pmdval_t) (*tmp);
+	if (ret) {
+		/*
+		 * If the low part is null, we must not read the high part
+		 * or we can end up with a partial pmd.
+		 */
+		smp_rmb();
+		ret |= ((pmdval_t)*(tmp + 1)) << 32;
+	}
+
+	return (pmd_t) { ret };
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+	return (pmd_t) { atomic64_read((atomic64_t *)pmdp) };
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
 	set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index e2768f1..6f2b45a 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -445,6 +445,18 @@ static inline int pmd_write(pmd_t pmd)
 #endif /* __HAVE_ARCH_PMD_WRITE */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#ifndef pmd_read_atomic
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+	/*
+	 * Depend on compiler for an atomic pmd read. NOTE: this is
+	 * only going to work, if the pmdval_t isn't larger than
+	 * an unsigned long.
+	 */
+	return *pmdp;
+}
+#endif
+
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
@@ -458,11 +470,17 @@ static inline int pmd_write(pmd_t pmd)
  * undefined so behaving like if the pmd was none is safe (because it
  * can return none anyway). The compiler level barrier() is critically
  * important to compute the two checks atomically on the same pmdval.
+ *
+ * For 32bit kernels with a 64bit large pmd_t this automatically takes
+ * care of reading the pmd atomically to avoid SMP race conditions
+ * against pmd_populate() when the mmap_sem is hold for reading by the
+ * caller (a special atomic read not done by "gcc" as in the generic
+ * version above, is also needed when THP is disabled because the page
+ * fault can populate the pmd from under us).
  */
 static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
 {
-	/* depend on compiler for an atomic pmd read */
-	pmd_t pmdval = *pmd;
+	pmd_t pmdval = pmd_read_atomic(pmd);
 	/*
 	 * The barrier will stabilize the pmdval in a register or on
 	 * the stack so that it will stop changing under the code.
-- 
cgit v0.10.2


From dbda591d920b4c7692725b13e3f68ecb251e9080 Mon Sep 17 00:00:00 2001
From: KyongHo <pullip.cho@samsung.com>
Date: Tue, 29 May 2012 15:06:49 -0700
Subject: mm: fix faulty initialization in vmalloc_init()

The transfer of ->flags causes some of the static mapping virtual
addresses to be prematurely freed (before the mapping is removed) because
VM_LAZY_FREE gets "set" if tmp->flags has VM_IOREMAP set.  This might
cause subsequent vmalloc/ioremap calls to fail because it might allocate
one of the freed virtual address ranges that aren't unmapped.

va->flags has different types of flags from tmp->flags.  If a region with
VM_IOREMAP set is registered with vm_area_add_early(), it will be removed
by __purge_vmap_area_lazy().

Fix vmalloc_init() to correctly initialize vmap_area for the given
vm_struct.

Also initialise va->vm.  If it is not set, find_vm_area() for the early
vm regions will always fail.

Signed-off-by: KyongHo Cho <pullip.cho@samsung.com>
Cc: "Olav Haugan" <ohaugan@codeaurora.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c28b0b9..2aad499 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1185,9 +1185,10 @@ void __init vmalloc_init(void)
 	/* Import existing vmlist entries. */
 	for (tmp = vmlist; tmp; tmp = tmp->next) {
 		va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
-		va->flags = tmp->flags | VM_VM_AREA;
+		va->flags = VM_VM_AREA;
 		va->va_start = (unsigned long)tmp->addr;
 		va->va_end = va->va_start + tmp->size;
+		va->vm = tmp;
 		__insert_vmap_area(va);
 	}
 
-- 
cgit v0.10.2


From 5bf5f03c271907978489868a4c72aeb42b5127d2 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Tue, 29 May 2012 15:06:49 -0700
Subject: mm: fix slab->page flags corruption

Transparent huge pages can change page->flags (PG_compound_lock) without
taking Slab lock.  Since THP can not break slab pages we can safely access
compound page without taking compound lock.

Specifically this patch fixes a race between compound_unlock() and slab
functions which perform page-flags updates.  This can occur when
get_page()/put_page() is called on a page from slab.

[akpm@linux-foundation.org: tweak comment text, fix comment layout, fix label indenting]
Reported-by: Amey Bhide <abhide@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Acked-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/mm.h b/include/linux/mm.h
index aa20baf..ce26716 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -321,6 +321,7 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 static inline void compound_lock(struct page *page)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	VM_BUG_ON(PageSlab(page));
 	bit_spin_lock(PG_compound_lock, &page->flags);
 #endif
 }
@@ -328,6 +329,7 @@ static inline void compound_lock(struct page *page)
 static inline void compound_unlock(struct page *page)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	VM_BUG_ON(PageSlab(page));
 	bit_spin_unlock(PG_compound_lock, &page->flags);
 #endif
 }
diff --git a/mm/swap.c b/mm/swap.c
index 5c13f13..6fdd72e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -82,6 +82,25 @@ static void put_compound_page(struct page *page)
 		if (likely(page != page_head &&
 			   get_page_unless_zero(page_head))) {
 			unsigned long flags;
+
+			/*
+			 * THP can not break up slab pages so avoid taking
+			 * compound_lock().  Slab performs non-atomic bit ops
+			 * on page->flags for better performance.  In particular
+			 * slab_unlock() in slub used to be a hot path.  It is
+			 * still hot on arches that do not support
+			 * this_cpu_cmpxchg_double().
+			 */
+			if (PageSlab(page_head)) {
+				if (PageTail(page)) {
+					if (put_page_testzero(page_head))
+						VM_BUG_ON(1);
+
+					atomic_dec(&page->_mapcount);
+					goto skip_lock_tail;
+				} else
+					goto skip_lock;
+			}
 			/*
 			 * page_head wasn't a dangling pointer but it
 			 * may not be a head page anymore by the time
@@ -92,10 +111,10 @@ static void put_compound_page(struct page *page)
 			if (unlikely(!PageTail(page))) {
 				/* __split_huge_page_refcount run before us */
 				compound_unlock_irqrestore(page_head, flags);
-				VM_BUG_ON(PageHead(page_head));
+skip_lock:
 				if (put_page_testzero(page_head))
 					__put_single_page(page_head);
-			out_put_single:
+out_put_single:
 				if (put_page_testzero(page))
 					__put_single_page(page);
 				return;
@@ -115,6 +134,8 @@ static void put_compound_page(struct page *page)
 			VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
 			VM_BUG_ON(atomic_read(&page->_count) != 0);
 			compound_unlock_irqrestore(page_head, flags);
+
+skip_lock_tail:
 			if (put_page_testzero(page_head)) {
 				if (PageHead(page_head))
 					__put_compound_page(page_head);
@@ -162,6 +183,18 @@ bool __get_page_tail(struct page *page)
 	struct page *page_head = compound_trans_head(page);
 
 	if (likely(page != page_head && get_page_unless_zero(page_head))) {
+
+		/* Ref to put_compound_page() comment. */
+		if (PageSlab(page_head)) {
+			if (likely(PageTail(page))) {
+				__get_page_tail_foll(page, false);
+				return true;
+			} else {
+				put_page(page_head);
+				return false;
+			}
+		}
+
 		/*
 		 * page_head wasn't a dangling pointer but it
 		 * may not be a head page anymore by the time
-- 
cgit v0.10.2


From 4e2f07750d9a94e8f23e86408df5ab95be88bf11 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Tue, 29 May 2012 15:06:50 -0700
Subject: mm/memblock: cleanup on duplicate VA/PA conversion

The overall memblock has been organized into the memory regions and
reserved regions.  Initially, the memory regions and reserved regions are
stored in the predetermined arrays of "struct memblock _region".  It's
possible for the arrays to be enlarged when we have newly added regions
for them, but no enough space there.  Under the situation, We will created
double-sized array to meet the requirement.  However, the original
implementation converted the VA (Virtual Address) of the newly allocated
array of regions to PA (Physical Address), then translate back when we
allocates the new array from slab.  That's actually unnecessary.

The patch removes the duplicate VA/PA conversion.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memblock.c b/mm/memblock.c
index a44eab3..eae06ea 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -212,14 +212,15 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 	if (use_slab) {
 		new_array = kmalloc(new_size, GFP_KERNEL);
 		addr = new_array ? __pa(new_array) : 0;
-	} else
+	} else {
 		addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
+		new_array = addr ? __va(addr) : 0;
+	}
 	if (!addr) {
 		pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
 		       memblock_type_name(type), type->max, type->max * 2);
 		return -1;
 	}
-	new_array = __va(addr);
 
 	memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
 		 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
-- 
cgit v0.10.2


From 181eb39425f2b9275afcb015eaa547d11f71a02f Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Tue, 29 May 2012 15:06:50 -0700
Subject: mm/memblock: fix memory leak on extending regions

The overall memblock has been organized into the memory regions and
reserved regions.  Initially, the memory regions and reserved regions are
stored in the predetermined arrays of "struct memblock _region".  It's
possible for the arrays to be enlarged when we have newly added regions,
but no free space left there.  The policy here is to create double-sized
array either by slab allocator or memblock allocator.  Unfortunately, we
didn't free the old array, which might be allocated through slab allocator
before.  That would cause memory leak.

The patch introduces 2 variables to trace where (slab or memblock) the
memory and reserved regions come from.  The memory for the memory or
reserved regions will be deallocated by kfree() if that was allocated by
slab allocator.  Thus to fix the memory leak issue.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memblock.c b/mm/memblock.c
index eae06ea..952123e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = {
 
 int memblock_debug __initdata_memblock;
 static int memblock_can_resize __initdata_memblock;
+static int memblock_memory_in_slab __initdata_memblock = 0;
+static int memblock_reserved_in_slab __initdata_memblock = 0;
 
 /* inline so we don't get a warning when pr_debug is compiled out */
 static inline const char *memblock_type_name(struct memblock_type *type)
@@ -187,6 +189,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 	struct memblock_region *new_array, *old_array;
 	phys_addr_t old_size, new_size, addr;
 	int use_slab = slab_is_available();
+	int *in_slab;
 
 	/* We don't allow resizing until we know about the reserved regions
 	 * of memory that aren't suitable for allocation
@@ -198,6 +201,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 	old_size = type->max * sizeof(struct memblock_region);
 	new_size = old_size << 1;
 
+	/* Retrieve the slab flag */
+	if (type == &memblock.memory)
+		in_slab = &memblock_memory_in_slab;
+	else
+		in_slab = &memblock_reserved_in_slab;
+
 	/* Try to find some space for it.
 	 *
 	 * WARNING: We assume that either slab_is_available() and we use it or
@@ -235,22 +244,24 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 	type->regions = new_array;
 	type->max <<= 1;
 
-	/* If we use SLAB that's it, we are done */
-	if (use_slab)
-		return 0;
-
-	/* Add the new reserved region now. Should not fail ! */
-	BUG_ON(memblock_reserve(addr, new_size));
-
-	/* If the array wasn't our static init one, then free it. We only do
-	 * that before SLAB is available as later on, we don't know whether
-	 * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
-	 * anyways
+	/* Free old array. We needn't free it if the array is the
+	 * static one
 	 */
-	if (old_array != memblock_memory_init_regions &&
-	    old_array != memblock_reserved_init_regions)
+	if (*in_slab)
+		kfree(old_array);
+	else if (old_array != memblock_memory_init_regions &&
+		 old_array != memblock_reserved_init_regions)
 		memblock_free(__pa(old_array), old_size);
 
+	/* Reserve the new array if that comes from the memblock.
+	 * Otherwise, we needn't do it
+	 */
+	if (!use_slab)
+		BUG_ON(memblock_reserve(addr, new_size));
+
+	/* Update slab flag */
+	*in_slab = use_slab;
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From 4b91355e9dc9ac1eb3d69e56de093899ff2677ef Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 29 May 2012 15:06:51 -0700
Subject: memcg: fix/change behavior of shared anon at moving task

This patch changes memcg's behavior at task_move().

At task_move(), the kernel scans a task's page table and move the changes
for mapped pages from source cgroup to target cgroup.  There has been a
bug at handling shared anonymous pages for a long time.

Before patch:
  - The spec says 'shared anonymous pages are not moved.'
  - The implementation was 'shared anonymoys pages may be moved'.
    If page_mapcount <=2, shared anonymous pages's charge were moved.

After patch:
  - The spec says 'all anonymous pages are moved'.
  - The implementation is 'all anonymous pages are moved'.

Considering usage of memcg, this will not affect user's experience.
'shared anonymous' pages only exists between a tree of processes which
don't do exec().  Moving one of process without exec() seems not sane.
For example, libcgroup will not be affected by this change.  (Anyway, no
one noticed the implementation for a long time...)

Below is a discussion log:

 - current spec/implementation are complex
 - Now, shared file caches are moved
 - It adds unclear check as page_mapcount(). To do correct check,
   we should check swap users, etc.
 - No one notice this implementation behavior. So, no one get benefit
   from the design.
 - In general, once task is moved to a cgroup for running, it will not
   be moved....
 - Finally, we have control knob as memory.move_charge_at_immigrate.

Here is a patch to allow moving shared pages, completely. This makes
memcg simpler and fix current broken code.

Suggested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Glauber Costa <glommer@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index e479007..6a066a2 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -184,12 +184,14 @@ behind this approach is that a cgroup that aggressively uses a shared
 page will eventually get charged for it (once it is uncharged from
 the cgroup that brought it in -- this will happen on memory pressure).
 
+But see section 8.2: when moving a task to another cgroup, its pages may
+be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
+
 Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used.
 When you do swapoff and make swapped-out pages of shmem(tmpfs) to
 be backed into memory in force, charges for pages are accounted against the
 caller of swapoff rather than the users of shmem.
 
-
 2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP)
 
 Swap Extension allows you to record charge for swap. A swapped-in page is
@@ -615,8 +617,7 @@ memory cgroup.
   bit | what type of charges would be moved ?
  -----+------------------------------------------------------------------------
    0  | A charge of an anonymous page(or swap of it) used by the target task.
-      | Those pages and swaps must be used only by the target task. You must
-      | enable Swap Extension(see 2.4) to enable move of swap charges.
+      | You must enable Swap Extension(see 2.4) to enable move of swap charges.
  -----+------------------------------------------------------------------------
    1  | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory)
       | and swaps of tmpfs file) mmapped by the target task. Unlike the case of
@@ -629,8 +630,6 @@ memory cgroup.
 
 8.3 TODO
 
-- Implement madvise(2) to let users decide the vma to be moved or not to be
-  moved.
 - All of moving charge operations are done under cgroup_mutex. It's not good
   behavior to hold the mutex too long, so we may need some trick.
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d965c4b..49c0fa9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -359,7 +359,6 @@ struct backing_dev_info;
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 extern void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
-extern int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep);
 #else
 static inline void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
@@ -470,14 +469,6 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
 {
 }
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-static inline int
-mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
-{
-	return 0;
-}
-#endif
-
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d7ce417..e7db70f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5154,7 +5154,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 		return NULL;
 	if (PageAnon(page)) {
 		/* we don't move shared anon */
-		if (!move_anon() || page_mapcount(page) > 2)
+		if (!move_anon())
 			return NULL;
 	} else if (!move_file())
 		/* we ignore mapcount for file pages */
@@ -5165,26 +5165,32 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 	return page;
 }
 
+#ifdef CONFIG_SWAP
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
 {
-	int usage_count;
 	struct page *page = NULL;
 	swp_entry_t ent = pte_to_swp_entry(ptent);
 
 	if (!move_anon() || non_swap_entry(ent))
 		return NULL;
-	usage_count = mem_cgroup_count_swap_user(ent, &page);
-	if (usage_count > 1) { /* we don't move shared anon */
-		if (page)
-			put_page(page);
-		return NULL;
-	}
+	/*
+	 * Because lookup_swap_cache() updates some statistics counter,
+	 * we call find_get_page() with swapper_space directly.
+	 */
+	page = find_get_page(&swapper_space, ent.val);
 	if (do_swap_account)
 		entry->val = ent.val;
 
 	return page;
 }
+#else
+static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
+			unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{
+	return NULL;
+}
+#endif
 
 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b0c86e9..457b10b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -717,37 +717,6 @@ int free_swap_and_cache(swp_entry_t entry)
 	return p != NULL;
 }
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-/**
- * mem_cgroup_count_swap_user - count the user of a swap entry
- * @ent: the swap entry to be checked
- * @pagep: the pointer for the swap cache page of the entry to be stored
- *
- * Returns the number of the user of the swap entry. The number is valid only
- * for swaps of anonymous pages.
- * If the entry is found on swap cache, the page is stored to pagep with
- * refcount of it being incremented.
- */
-int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
-{
-	struct page *page;
-	struct swap_info_struct *p;
-	int count = 0;
-
-	page = find_get_page(&swapper_space, ent.val);
-	if (page)
-		count += page_mapcount(page);
-	p = swap_info_get(ent);
-	if (p) {
-		count += swap_count(p->swap_map[swp_offset(ent)]);
-		spin_unlock(&swap_lock);
-	}
-
-	*pagep = page;
-	return count;
-}
-#endif
-
 #ifdef CONFIG_HIBERNATION
 /*
  * Find the swap type that corresponds to given device (if any).
-- 
cgit v0.10.2


From e91cbb42531626cd4fd0673ca01daf53e338d8f9 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:51 -0700
Subject: memcg swap: mem_cgroup_move_swap_account never needs fixup

The need_fixup arg to mem_cgroup_move_swap_account() is always false,
so just remove it.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e7db70f..d5ba631e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3165,7 +3165,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
  * @entry: swap entry to be moved
  * @from:  mem_cgroup which the entry is moved from
  * @to:  mem_cgroup which the entry is moved to
- * @need_fixup: whether we should fixup res_counters and refcounts.
  *
  * It succeeds only when the swap_cgroup's record for this entry is the same
  * as the mem_cgroup's id of @from.
@@ -3176,7 +3175,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
  * both res and memsw, and called css_get().
  */
 static int mem_cgroup_move_swap_account(swp_entry_t entry,
-		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
+				struct mem_cgroup *from, struct mem_cgroup *to)
 {
 	unsigned short old_id, new_id;
 
@@ -3195,24 +3194,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 		 * swap-in, the refcount of @to might be decreased to 0.
 		 */
 		mem_cgroup_get(to);
-		if (need_fixup) {
-			if (!mem_cgroup_is_root(from))
-				res_counter_uncharge(&from->memsw, PAGE_SIZE);
-			mem_cgroup_put(from);
-			/*
-			 * we charged both to->res and to->memsw, so we should
-			 * uncharge to->res.
-			 */
-			if (!mem_cgroup_is_root(to))
-				res_counter_uncharge(&to->res, PAGE_SIZE);
-		}
 		return 0;
 	}
 	return -EINVAL;
 }
 #else
 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
-		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
+				struct mem_cgroup *from, struct mem_cgroup *to)
 {
 	return -EINVAL;
 }
@@ -5546,8 +5534,7 @@ put:			/* get_mctgt_type() gets the page */
 			break;
 		case MC_TARGET_SWAP:
 			ent = target.ent;
-			if (!mem_cgroup_move_swap_account(ent,
-						mc.from, mc.to, false)) {
+			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
 				mc.precharge--;
 				/* we fixup refcnts and charges later. */
 				mc.moved_swap++;
-- 
cgit v0.10.2


From 86493009d3b7e51eee38575f9537b754f5b6c536 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:52 -0700
Subject: memcg swap: use mem_cgroup_uncharge_swap()

That stuff __mem_cgroup_commit_charge_swapin() does with a swap entry, it
has a name and even a declaration: just use mem_cgroup_uncharge_swap().

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ba631e..30f938c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2855,24 +2855,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 	 */
 	if (do_swap_account && PageSwapCache(page)) {
 		swp_entry_t ent = {.val = page_private(page)};
-		struct mem_cgroup *swap_memcg;
-		unsigned short id;
-
-		id = swap_cgroup_record(ent, 0);
-		rcu_read_lock();
-		swap_memcg = mem_cgroup_lookup(id);
-		if (swap_memcg) {
-			/*
-			 * This recorded memcg can be obsolete one. So, avoid
-			 * calling css_tryget
-			 */
-			if (!mem_cgroup_is_root(swap_memcg))
-				res_counter_uncharge(&swap_memcg->memsw,
-						     PAGE_SIZE);
-			mem_cgroup_swap_statistics(swap_memcg, false);
-			mem_cgroup_put(swap_memcg);
-		}
-		rcu_read_unlock();
+		mem_cgroup_uncharge_swap(ent);
 	}
 	/*
 	 * At swapin, we may charge account against cgroup which has no tasks.
-- 
cgit v0.10.2


From c3c787e8c38557ccf44c670d73aebe630a2b1479 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:52 -0700
Subject: mm/memcg: scanning_global_lru means mem_cgroup_disabled

Although one has to admire the skill with which it has been concealed,
scanning_global_lru(mz) is actually just an interesting way to test
mem_cgroup_disabled().  Too many developer hours have been wasted on
confusing it with global_reclaim(): just use mem_cgroup_disabled().

Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Glauber Costa <glommer@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/vmscan.c b/mm/vmscan.c
index ee97530..52fac58 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -140,26 +140,16 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
-
-static bool scanning_global_lru(struct mem_cgroup_zone *mz)
-{
-	return !mz->mem_cgroup;
-}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
-
-static bool scanning_global_lru(struct mem_cgroup_zone *mz)
-{
-	return true;
-}
 #endif
 
 static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
 {
-	if (!scanning_global_lru(mz))
+	if (!mem_cgroup_disabled())
 		return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
 
 	return &mz->zone->reclaim_stat;
@@ -168,7 +158,7 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
 static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
 				       enum lru_list lru)
 {
-	if (!scanning_global_lru(mz))
+	if (!mem_cgroup_disabled())
 		return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
 						    zone_to_nid(mz->zone),
 						    zone_idx(mz->zone),
@@ -1589,7 +1579,7 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
 	if (!total_swap_pages)
 		return 0;
 
-	if (!scanning_global_lru(mz))
+	if (!mem_cgroup_disabled())
 		return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
 						       mz->zone);
 
@@ -1628,7 +1618,7 @@ static int inactive_file_is_low_global(struct zone *zone)
  */
 static int inactive_file_is_low(struct mem_cgroup_zone *mz)
 {
-	if (!scanning_global_lru(mz))
+	if (!mem_cgroup_disabled())
 		return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
 						       mz->zone);
 
-- 
cgit v0.10.2


From 89abfab133ef1f5902abafb744df72793213ac19 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 29 May 2012 15:06:53 -0700
Subject: mm/memcg: move reclaim_stat into lruvec

With mem_cgroup_disabled() now explicit, it becomes clear that the
zone_reclaim_stat structure actually belongs in lruvec, per-zone when
memcg is disabled but per-memcg per-zone when it's enabled.

We can delete mem_cgroup_get_reclaim_stat(), and change
update_page_reclaim_stat() to update just the one set of stats, the one
which get_scan_count() will actually use.

Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 18ea0b7..cfe9050 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -126,8 +126,6 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg,
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
 					int nid, int zid, unsigned int lrumask);
-struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
-						      struct zone *zone);
 struct zone_reclaim_stat*
 mem_cgroup_get_reclaim_stat_from_page(struct page *page);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
@@ -356,13 +354,6 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
 	return 0;
 }
 
-
-static inline struct zone_reclaim_stat*
-mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone)
-{
-	return NULL;
-}
-
 static inline struct zone_reclaim_stat*
 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4871e31..1b89861 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -185,8 +185,22 @@ static inline int is_unevictable_lru(enum lru_list lru)
 	return (lru == LRU_UNEVICTABLE);
 }
 
+struct zone_reclaim_stat {
+	/*
+	 * The pageout code in vmscan.c keeps track of how many of the
+	 * mem/swap backed and file backed pages are refeferenced.
+	 * The higher the rotated/scanned ratio, the more valuable
+	 * that cache is.
+	 *
+	 * The anon LRU stats live in [0], file LRU stats in [1]
+	 */
+	unsigned long		recent_rotated[2];
+	unsigned long		recent_scanned[2];
+};
+
 struct lruvec {
 	struct list_head lists[NR_LRU_LISTS];
+	struct zone_reclaim_stat reclaim_stat;
 };
 
 /* Mask used at gathering information at once (see memcontrol.c) */
@@ -313,19 +327,6 @@ enum zone_type {
 #error ZONES_SHIFT -- too many zones configured adjust calculation
 #endif
 
-struct zone_reclaim_stat {
-	/*
-	 * The pageout code in vmscan.c keeps track of how many of the
-	 * mem/swap backed and file backed pages are refeferenced.
-	 * The higher the rotated/scanned ratio, the more valuable
-	 * that cache is.
-	 *
-	 * The anon LRU stats live in [0], file LRU stats in [1]
-	 */
-	unsigned long		recent_rotated[2];
-	unsigned long		recent_scanned[2];
-};
-
 struct zone {
 	/* Fields commonly accessed by the page allocator */
 
@@ -407,8 +408,6 @@ struct zone {
 	spinlock_t		lru_lock;
 	struct lruvec		lruvec;
 
-	struct zone_reclaim_stat reclaim_stat;
-
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	unsigned long		flags;		   /* zone flags, see below */
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 30f938c..00c8898 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -138,7 +138,6 @@ struct mem_cgroup_per_zone {
 
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
-	struct zone_reclaim_stat reclaim_stat;
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long long	usage_in_excess;/* Set to the value by which */
 						/* the soft limit is exceeded*/
@@ -1243,16 +1242,6 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
 	return (active > inactive);
 }
 
-struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
-						      struct zone *zone)
-{
-	int nid = zone_to_nid(zone);
-	int zid = zone_idx(zone);
-	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-
-	return &mz->reclaim_stat;
-}
-
 struct zone_reclaim_stat *
 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 {
@@ -1268,7 +1257,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
 	smp_rmb();
 	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
-	return &mz->reclaim_stat;
+	return &mz->lruvec.reclaim_stat;
 }
 
 #define mem_cgroup_from_res_counter(counter, member)	\
@@ -4216,21 +4205,19 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 	{
 		int nid, zid;
 		struct mem_cgroup_per_zone *mz;
+		struct zone_reclaim_stat *rstat;
 		unsigned long recent_rotated[2] = {0, 0};
 		unsigned long recent_scanned[2] = {0, 0};
 
 		for_each_online_node(nid)
 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 				mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+				rstat = &mz->lruvec.reclaim_stat;
 
-				recent_rotated[0] +=
-					mz->reclaim_stat.recent_rotated[0];
-				recent_rotated[1] +=
-					mz->reclaim_stat.recent_rotated[1];
-				recent_scanned[0] +=
-					mz->reclaim_stat.recent_scanned[0];
-				recent_scanned[1] +=
-					mz->reclaim_stat.recent_scanned[1];
+				recent_rotated[0] += rstat->recent_rotated[0];
+				recent_rotated[1] += rstat->recent_rotated[1];
+				recent_scanned[0] += rstat->recent_scanned[0];
+				recent_scanned[1] += rstat->recent_scanned[1];
 			}
 		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
 		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4fc462b..8cbfc38 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4410,10 +4410,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		zone_pcp_init(zone);
 		for_each_lru(lru)
 			INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
-		zone->reclaim_stat.recent_rotated[0] = 0;
-		zone->reclaim_stat.recent_rotated[1] = 0;
-		zone->reclaim_stat.recent_scanned[0] = 0;
-		zone->reclaim_stat.recent_scanned[1] = 0;
+		zone->lruvec.reclaim_stat.recent_rotated[0] = 0;
+		zone->lruvec.reclaim_stat.recent_rotated[1] = 0;
+		zone->lruvec.reclaim_stat.recent_scanned[0] = 0;
+		zone->lruvec.reclaim_stat.recent_scanned[1] = 0;
 		zap_zone_vm_stats(zone);
 		zone->flags = 0;
 		if (!size)
diff --git a/mm/swap.c b/mm/swap.c
index 6fdd72e..0503ad7 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -312,21 +312,15 @@ void rotate_reclaimable_page(struct page *page)
 static void update_page_reclaim_stat(struct zone *zone, struct page *page,
 				     int file, int rotated)
 {
-	struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat;
-	struct zone_reclaim_stat *memcg_reclaim_stat;
+	struct zone_reclaim_stat *reclaim_stat;
 
-	memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
+	reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
+	if (!reclaim_stat)
+		reclaim_stat = &zone->lruvec.reclaim_stat;
 
 	reclaim_stat->recent_scanned[file]++;
 	if (rotated)
 		reclaim_stat->recent_rotated[file]++;
-
-	if (!memcg_reclaim_stat)
-		return;
-
-	memcg_reclaim_stat->recent_scanned[file]++;
-	if (rotated)
-		memcg_reclaim_stat->recent_rotated[file]++;
 }
 
 static void __activate_page(struct page *page, void *arg)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 52fac58..e234ada 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -149,10 +149,7 @@ static bool global_reclaim(struct scan_control *sc)
 
 static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
 {
-	if (!mem_cgroup_disabled())
-		return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
-
-	return &mz->zone->reclaim_stat;
+	return &mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup)->reclaim_stat;
 }
 
 static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
-- 
cgit v0.10.2