From a82afdfcb8c0df09776b6458af6b68fc58b2e87b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Jul 2009 17:48:16 +0900
Subject: block: use the same failfast bits for bio and request

bio and request use the same set of failfast bits.  This patch makes
the following changes to simplify things.

* enumify BIO_RW* bits and reorder bits such that BIOS_RW_FAILFAST_*
  bits coincide with __REQ_FAILFAST_* bits.

* The above pushes BIO_RW_AHEAD out of sync with __REQ_FAILFAST_DEV
  but the matching is useless anyway.  init_request_from_bio() is
  responsible for setting FAILFAST bits on FS requests and non-FS
  requests never use BIO_RW_AHEAD.  Drop the code and comment from
  blk_rq_bio_prep().

* Define REQ_FAILFAST_MASK which is OR of all FAILFAST bits and
  simplify FAILFAST flags handling in init_request_from_bio().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index e3299a7..4daae1e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1111,17 +1111,13 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	req->cmd_type = REQ_TYPE_FS;
 
 	/*
-	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
+	 * Inherit FAILFAST from bio (for read-ahead, and explicit
+	 * FAILFAST).  FAILFAST flags are identical for req and bio.
 	 */
 	if (bio_rw_ahead(bio))
-		req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
-				   REQ_FAILFAST_DRIVER);
-	if (bio_failfast_dev(bio))
-		req->cmd_flags |= REQ_FAILFAST_DEV;
-	if (bio_failfast_transport(bio))
-		req->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-	if (bio_failfast_driver(bio))
-		req->cmd_flags |= REQ_FAILFAST_DRIVER;
+		req->cmd_flags |= REQ_FAILFAST_MASK;
+	else
+		req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK;
 
 	if (unlikely(bio_discard(bio))) {
 		req->cmd_flags |= REQ_DISCARD;
@@ -2239,9 +2235,8 @@ EXPORT_SYMBOL(__blk_end_request_cur);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
-	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
-	   we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
-	rq->cmd_flags |= (bio->bi_rw & 3);
+	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
+	rq->cmd_flags |= bio->bi_rw & REQ_RW;
 
 	if (bio_has_data(bio)) {
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2892b71..a299ed3 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -142,37 +142,40 @@ struct bio {
  *
  * bit 0 -- data direction
  *	If not set, bio is a read from device. If set, it's a write to device.
- * bit 1 -- rw-ahead when set
- * bit 2 -- barrier
+ * bit 1 -- fail fast device errors
+ * bit 2 -- fail fast transport errors
+ * bit 3 -- fail fast driver errors
+ * bit 4 -- rw-ahead when set
+ * bit 5 -- barrier
  *	Insert a serialization point in the IO queue, forcing previously
  *	submitted IO to be completed before this one is issued.
- * bit 3 -- synchronous I/O hint.
- * bit 4 -- Unplug the device immediately after submitting this bio.
- * bit 5 -- metadata request
+ * bit 6 -- synchronous I/O hint.
+ * bit 7 -- Unplug the device immediately after submitting this bio.
+ * bit 8 -- metadata request
  *	Used for tracing to differentiate metadata and data IO. May also
  *	get some preferential treatment in the IO scheduler
- * bit 6 -- discard sectors
+ * bit 9 -- discard sectors
  *	Informs the lower level device that this range of sectors is no longer
  *	used by the file system and may thus be freed by the device. Used
  *	for flash based storage.
- * bit 7 -- fail fast device errors
- * bit 8 -- fail fast transport errors
- * bit 9 -- fail fast driver errors
  *	Don't want driver retries for any fast fail whatever the reason.
  * bit 10 -- Tell the IO scheduler not to wait for more requests after this
 	one has been submitted, even if it is a SYNC request.
  */
-#define BIO_RW		0	/* Must match RW in req flags (blkdev.h) */
-#define BIO_RW_AHEAD	1	/* Must match FAILFAST in req flags */
-#define BIO_RW_BARRIER	2
-#define BIO_RW_SYNCIO	3
-#define BIO_RW_UNPLUG	4
-#define BIO_RW_META	5
-#define BIO_RW_DISCARD	6
-#define BIO_RW_FAILFAST_DEV		7
-#define BIO_RW_FAILFAST_TRANSPORT	8
-#define BIO_RW_FAILFAST_DRIVER		9
-#define BIO_RW_NOIDLE	10
+enum bio_rw_flags {
+	BIO_RW,
+	BIO_RW_FAILFAST_DEV,
+	BIO_RW_FAILFAST_TRANSPORT,
+	BIO_RW_FAILFAST_DRIVER,
+	/* above flags must match REQ_* */
+	BIO_RW_AHEAD,
+	BIO_RW_BARRIER,
+	BIO_RW_SYNCIO,
+	BIO_RW_UNPLUG,
+	BIO_RW_META,
+	BIO_RW_DISCARD,
+	BIO_RW_NOIDLE,
+};
 
 #define bio_rw_flagged(bio, flag)	((bio)->bi_rw & (1 << (flag)))
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 69103e0..c3015736 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -93,6 +93,7 @@ enum rq_flag_bits {
 	__REQ_FAILFAST_DEV,	/* no driver retries of device errors */
 	__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
 	__REQ_FAILFAST_DRIVER,	/* no driver retries of driver errors */
+	/* above flags must match BIO_RW_* */
 	__REQ_DISCARD,		/* request to discard sectors */
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
@@ -143,6 +144,9 @@ enum rq_flag_bits {
 #define REQ_NOIDLE	(1 << __REQ_NOIDLE)
 #define REQ_IO_STAT	(1 << __REQ_IO_STAT)
 
+#define REQ_FAILFAST_MASK	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | \
+				 REQ_FAILFAST_DRIVER)
+
 #define BLK_MAX_CDB	16
 
 /*
-- 
cgit v0.10.2


From 80a761fd33cf812f771e212139157bf8f58d4b3f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Jul 2009 17:48:17 +0900
Subject: block: implement mixed merge of different failfast requests

Failfast has characteristics from other attributes.  When issuing,
executing and successuflly completing requests, failfast doesn't make
any difference.  It only affects how a request is handled on failure.
Allowing requests with different failfast settings to be merged cause
normal IOs to fail prematurely while not allowing has performance
penalties as failfast is used for read aheads which are likely to be
located near in-flight or to-be-issued normal IOs.

This patch introduces the concept of 'mixed merge'.  A request is a
mixed merge if it is merge of segments which require different
handling on failure.  Currently the only mixable attributes are
failfast ones (or lack thereof).

When a bio with different failfast settings is added to an existing
request or requests of different failfast settings are merged, the
merged request is marked mixed.  Each bio carries failfast settings
and the request always tracks failfast state of the first bio.  When
the request fails, blk_rq_err_bytes() can be used to determine how
many bytes can be safely failed without crossing into an area which
requires further retrials.

This allows request merging regardless of failfast settings while
keeping the failure handling correct.

This patch only implements mixed merge but doesn't enable it.  The
next one will update SCSI to make use of mixed merge.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Niel Lambrechts <niel.lambrechts@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 4daae1e..c822239 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1157,6 +1157,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	const unsigned short prio = bio_prio(bio);
 	const int sync = bio_sync(bio);
 	const int unplug = bio_unplug(bio);
+	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 	int rw_flags;
 
 	if (bio_barrier(bio) && bio_has_data(bio) &&
@@ -1186,6 +1187,9 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 		trace_block_bio_backmerge(q, bio);
 
+		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+			blk_rq_set_mixed_merge(req);
+
 		req->biotail->bi_next = bio;
 		req->biotail = bio;
 		req->__data_len += bytes;
@@ -1205,6 +1209,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 		trace_block_bio_frontmerge(q, bio);
 
+		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
+			blk_rq_set_mixed_merge(req);
+			req->cmd_flags &= ~REQ_FAILFAST_MASK;
+			req->cmd_flags |= ff;
+		}
+
 		bio->bi_next = req->bio;
 		req->bio = bio;
 
@@ -1649,6 +1659,50 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
 
+/**
+ * blk_rq_err_bytes - determine number of bytes till the next failure boundary
+ * @rq: request to examine
+ *
+ * Description:
+ *     A request could be merge of IOs which require different failure
+ *     handling.  This function determines the number of bytes which
+ *     can be failed from the beginning of the request without
+ *     crossing into area which need to be retried further.
+ *
+ * Return:
+ *     The number of bytes to fail.
+ *
+ * Context:
+ *     queue_lock must be held.
+ */
+unsigned int blk_rq_err_bytes(const struct request *rq)
+{
+	unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+	unsigned int bytes = 0;
+	struct bio *bio;
+
+	if (!(rq->cmd_flags & REQ_MIXED_MERGE))
+		return blk_rq_bytes(rq);
+
+	/*
+	 * Currently the only 'mixing' which can happen is between
+	 * different fastfail types.  We can safely fail portions
+	 * which have all the failfast bits that the first one has -
+	 * the ones which are at least as eager to fail as the first
+	 * one.
+	 */
+	for (bio = rq->bio; bio; bio = bio->bi_next) {
+		if ((bio->bi_rw & ff) != ff)
+			break;
+		bytes += bio->bi_size;
+	}
+
+	/* this could lead to infinite loop */
+	BUG_ON(blk_rq_bytes(rq) && !bytes);
+	return bytes;
+}
+EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
+
 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
 	if (blk_do_io_stat(req)) {
@@ -1995,6 +2049,12 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	if (blk_fs_request(req) || blk_discard_rq(req))
 		req->__sector += total_bytes >> 9;
 
+	/* mixed attributes always follow the first bio */
+	if (req->cmd_flags & REQ_MIXED_MERGE) {
+		req->cmd_flags &= ~REQ_FAILFAST_MASK;
+		req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK;
+	}
+
 	/*
 	 * If total number of sectors is less than the first segment
 	 * size, something has gone terribly wrong.
@@ -2174,6 +2234,25 @@ bool blk_end_request_cur(struct request *rq, int error)
 EXPORT_SYMBOL(blk_end_request_cur);
 
 /**
+ * blk_end_request_err - Finish a request till the next failure boundary.
+ * @rq: the request to finish till the next failure boundary for
+ * @error: must be negative errno
+ *
+ * Description:
+ *     Complete @rq till the next failure boundary.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ */
+bool blk_end_request_err(struct request *rq, int error)
+{
+	WARN_ON(error >= 0);
+	return blk_end_request(rq, error, blk_rq_err_bytes(rq));
+}
+EXPORT_SYMBOL_GPL(blk_end_request_err);
+
+/**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
  * @error:    %0 for success, < %0 for error
@@ -2232,6 +2311,26 @@ bool __blk_end_request_cur(struct request *rq, int error)
 }
 EXPORT_SYMBOL(__blk_end_request_cur);
 
+/**
+ * __blk_end_request_err - Finish a request till the next failure boundary.
+ * @rq: the request to finish till the next failure boundary for
+ * @error: must be negative errno
+ *
+ * Description:
+ *     Complete @rq till the next failure boundary.  Must be called
+ *     with queue lock held.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ */
+bool __blk_end_request_err(struct request *rq, int error)
+{
+	WARN_ON(error >= 0);
+	return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
+}
+EXPORT_SYMBOL_GPL(__blk_end_request_err);
+
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
diff --git a/block/blk-merge.c b/block/blk-merge.c
index e199967..7c9ca01 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -311,6 +311,36 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	return 1;
 }
 
+/**
+ * blk_rq_set_mixed_merge - mark a request as mixed merge
+ * @rq: request to mark as mixed merge
+ *
+ * Description:
+ *     @rq is about to be mixed merged.  Make sure the attributes
+ *     which can be mixed are set in each bio and mark @rq as mixed
+ *     merged.
+ */
+void blk_rq_set_mixed_merge(struct request *rq)
+{
+	unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+	struct bio *bio;
+
+	if (rq->cmd_flags & REQ_MIXED_MERGE)
+		return;
+
+	/*
+	 * @rq will no longer represent mixable attributes for all the
+	 * contained bios.  It will just track those of the first one.
+	 * Distributes the attributs to each bio.
+	 */
+	for (bio = rq->bio; bio; bio = bio->bi_next) {
+		WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) &&
+			     (bio->bi_rw & REQ_FAILFAST_MASK) != ff);
+		bio->bi_rw |= ff;
+	}
+	rq->cmd_flags |= REQ_MIXED_MERGE;
+}
+
 static void blk_account_io_merge(struct request *req)
 {
 	if (blk_do_io_stat(req)) {
@@ -366,6 +396,19 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 		return 0;
 
 	/*
+	 * If failfast settings disagree or any of the two is already
+	 * a mixed merge, mark both as mixed before proceeding.  This
+	 * makes sure that all involved bios have mixable attributes
+	 * set properly.
+	 */
+	if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE ||
+	    (req->cmd_flags & REQ_FAILFAST_MASK) !=
+	    (next->cmd_flags & REQ_FAILFAST_MASK)) {
+		blk_rq_set_mixed_merge(req);
+		blk_rq_set_mixed_merge(next);
+	}
+
+	/*
 	 * At this point we have either done a back merge
 	 * or front merge. We need the smaller start_time of
 	 * the merged requests to be the current request
diff --git a/block/blk.h b/block/blk.h
index 3fae6ad..5ee3d7e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -104,6 +104,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 int attempt_back_merge(struct request_queue *q, struct request *rq);
 int attempt_front_merge(struct request_queue *q, struct request *rq);
 void blk_recalc_rq_segments(struct request *rq);
+void blk_rq_set_mixed_merge(struct request *rq);
 
 void blk_queue_congestion_threshold(struct request_queue *q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c3015736..650b6a9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,6 +115,7 @@ enum rq_flag_bits {
 	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
 	__REQ_NOIDLE,		/* Don't anticipate more IO after this one */
 	__REQ_IO_STAT,		/* account I/O stat */
+	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -143,6 +144,7 @@ enum rq_flag_bits {
 #define REQ_INTEGRITY	(1 << __REQ_INTEGRITY)
 #define REQ_NOIDLE	(1 << __REQ_NOIDLE)
 #define REQ_IO_STAT	(1 << __REQ_IO_STAT)
+#define REQ_MIXED_MERGE	(1 << __REQ_MIXED_MERGE)
 
 #define REQ_FAILFAST_MASK	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | \
 				 REQ_FAILFAST_DRIVER)
@@ -832,11 +834,13 @@ static inline void blk_run_address_space(struct address_space *mapping)
 }
 
 /*
- * blk_rq_pos()		: the current sector
- * blk_rq_bytes()	: bytes left in the entire request
- * blk_rq_cur_bytes()	: bytes left in the current segment
- * blk_rq_sectors()	: sectors left in the entire request
- * blk_rq_cur_sectors()	: sectors left in the current segment
+ * blk_rq_pos()			: the current sector
+ * blk_rq_bytes()		: bytes left in the entire request
+ * blk_rq_cur_bytes()		: bytes left in the current segment
+ * blk_rq_err_bytes()		: bytes left till the next error boundary
+ * blk_rq_sectors()		: sectors left in the entire request
+ * blk_rq_cur_sectors()		: sectors left in the current segment
+ * blk_rq_err_sectors()		: sectors left till the next error boundary
  */
 static inline sector_t blk_rq_pos(const struct request *rq)
 {
@@ -853,6 +857,8 @@ static inline int blk_rq_cur_bytes(const struct request *rq)
 	return rq->bio ? bio_cur_bytes(rq->bio) : 0;
 }
 
+extern unsigned int blk_rq_err_bytes(const struct request *rq);
+
 static inline unsigned int blk_rq_sectors(const struct request *rq)
 {
 	return blk_rq_bytes(rq) >> 9;
@@ -863,6 +869,11 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 	return blk_rq_cur_bytes(rq) >> 9;
 }
 
+static inline unsigned int blk_rq_err_sectors(const struct request *rq)
+{
+	return blk_rq_err_bytes(rq) >> 9;
+}
+
 /*
  * Request issue related functions.
  */
@@ -889,10 +900,12 @@ extern bool blk_end_request(struct request *rq, int error,
 			    unsigned int nr_bytes);
 extern void blk_end_request_all(struct request *rq, int error);
 extern bool blk_end_request_cur(struct request *rq, int error);
+extern bool blk_end_request_err(struct request *rq, int error);
 extern bool __blk_end_request(struct request *rq, int error,
 			      unsigned int nr_bytes);
 extern void __blk_end_request_all(struct request *rq, int error);
 extern bool __blk_end_request_cur(struct request *rq, int error);
+extern bool __blk_end_request_err(struct request *rq, int error);
 
 extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
-- 
cgit v0.10.2


From da6c5c720c52cc717124f8f0830b710ea6a092fd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 11 Sep 2009 14:26:40 +0200
Subject: scsi,block: update SCSI to handle mixed merge failures

Update scsi_io_completion() such that it only fails requests till the
next error boundary and retry the leftover.  This enables block layer
to merge requests with different failfast settings and still behave
correctly on errors.  Allow merge of requests of different failfast
settings.

As SCSI is currently the only subsystem which follows failfast status,
there's no need to worry about other block drivers for now.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Niel Lambrechts <niel.lambrechts@gmail.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 7c9ca01..b0de857 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -380,12 +380,6 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	if (blk_integrity_rq(req) != blk_integrity_rq(next))
 		return 0;
 
-	/* don't merge requests of different failfast settings */
-	if (blk_failfast_dev(req)	!= blk_failfast_dev(next)	||
-	    blk_failfast_transport(req)	!= blk_failfast_transport(next)	||
-	    blk_failfast_driver(req)	!= blk_failfast_driver(next))
-		return 0;
-
 	/*
 	 * If we are allowed to merge, then append bio list
 	 * from next to rq and release next. merge_requests_fn
diff --git a/block/elevator.c b/block/elevator.c
index 2d511f9..ca86192 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -100,19 +100,6 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (bio_integrity(bio) != blk_integrity_rq(rq))
 		return 0;
 
-	/*
-	 * Don't merge if failfast settings don't match.
-	 *
-	 * FIXME: The negation in front of each condition is necessary
-	 * because bio and request flags use different bit positions
-	 * and the accessors return those bits directly.  This
-	 * ugliness will soon go away.
-	 */
-	if (!bio_failfast_dev(bio)	 != !blk_failfast_dev(rq)	||
-	    !bio_failfast_transport(bio) != !blk_failfast_transport(rq)	||
-	    !bio_failfast_driver(bio)	 != !blk_failfast_driver(rq))
-		return 0;
-
 	if (!elv_iosched_allow_merge(rq, bio))
 		return 0;
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index f3c4089..90c94da 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -897,8 +897,10 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 			if (driver_byte(result) & DRIVER_SENSE)
 				scsi_print_sense("", cmd);
 		}
-		blk_end_request_all(req, -EIO);
-		scsi_next_command(cmd);
+		if (blk_end_request_err(req, -EIO))
+			scsi_requeue_command(q, cmd);
+		else
+			scsi_next_command(cmd);
 		break;
 	case ACTION_REPREP:
 		/* Unprep the request and put it back at the head of the queue.
-- 
cgit v0.10.2


From 5ad531db6e0f3c3c985666e83d3c1c4d53acccf9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 3 Jul 2009 12:57:48 +0200
Subject: cfq-iosched: drain device queue before switching to a sync queue

To lessen the impact of async IO on sync IO, let the device drain of
any async IO in progress when switching to a sync cfqq that has idling
enabled.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index fd7080e..93693bf 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -140,7 +140,7 @@ struct cfq_data {
 	 */
 	unsigned int busy_rt_queues;
 
-	int rq_in_driver;
+	int rq_in_driver[2];
 	int sync_flight;
 
 	/*
@@ -239,6 +239,11 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
 						struct io_context *);
 
+static inline int rq_in_driver(struct cfq_data *cfqd)
+{
+	return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
+}
+
 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
 					    int is_sync)
 {
@@ -760,9 +765,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 
-	cfqd->rq_in_driver++;
+	cfqd->rq_in_driver[rq_is_sync(rq)]++;
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
-						cfqd->rq_in_driver);
+						rq_in_driver(cfqd));
 
 	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
 }
@@ -770,11 +775,12 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
+	const int sync = rq_is_sync(rq);
 
-	WARN_ON(!cfqd->rq_in_driver);
-	cfqd->rq_in_driver--;
+	WARN_ON(!cfqd->rq_in_driver[sync]);
+	cfqd->rq_in_driver[sync]--;
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
-						cfqd->rq_in_driver);
+						rq_in_driver(cfqd));
 }
 
 static void cfq_remove_request(struct request *rq)
@@ -1080,7 +1086,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	/*
 	 * still requests with the driver, don't idle
 	 */
-	if (cfqd->rq_in_driver)
+	if (rq_in_driver(cfqd))
 		return;
 
 	/*
@@ -1312,6 +1318,12 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 		return 0;
 
 	/*
+	 * Drain async requests before we start sync IO
+	 */
+	if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
+		return 0;
+
+	/*
 	 * If this is an async queue and we have sync IO in flight, let it wait
 	 */
 	if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
@@ -2130,11 +2142,11 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
  */
 static void cfq_update_hw_tag(struct cfq_data *cfqd)
 {
-	if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
-		cfqd->rq_in_driver_peak = cfqd->rq_in_driver;
+	if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak)
+		cfqd->rq_in_driver_peak = rq_in_driver(cfqd);
 
 	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
-	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
+	    rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
 		return;
 
 	if (cfqd->hw_tag_samples++ < 50)
@@ -2161,9 +2173,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 
 	cfq_update_hw_tag(cfqd);
 
-	WARN_ON(!cfqd->rq_in_driver);
+	WARN_ON(!cfqd->rq_in_driver[sync]);
 	WARN_ON(!cfqq->dispatched);
-	cfqd->rq_in_driver--;
+	cfqd->rq_in_driver[sync]--;
 	cfqq->dispatched--;
 
 	if (cfq_cfqq_sync(cfqq))
@@ -2197,7 +2209,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 			cfq_arm_slice_timer(cfqd);
 	}
 
-	if (!cfqd->rq_in_driver)
+	if (!rq_in_driver(cfqd))
 		cfq_schedule_dispatch(cfqd);
 }
 
-- 
cgit v0.10.2


From d58b85e1e891cd842d6e183f5d94d06a4fd0122c Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 10 Jul 2009 10:20:30 +0200
Subject: cfq-iosched: no need to keep track of busy_rt_queues

o Get rid of busy_rt_queues infrastructure. Looks like it is redundant.

o Once an RT queue gets request it will preempt any of the BE or IDLE queues
  immediately. Otherwise this queue will be put on service tree and scheduler
  will anyway select this queue before any of the BE or IDLE queue. Hence
  looks like there is no need to keep track of how many busy RT queues are
  currently on service tree.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 93693bf..ca0d7e7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -134,11 +134,6 @@ struct cfq_data {
 	struct rb_root prio_trees[CFQ_PRIO_LISTS];
 
 	unsigned int busy_queues;
-	/*
-	 * Used to track any pending rt requests so we can pre-empt current
-	 * non-RT cfqq in service when this value is non-zero.
-	 */
-	unsigned int busy_rt_queues;
 
 	int rq_in_driver[2];
 	int sync_flight;
@@ -653,8 +648,6 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 	cfq_mark_cfqq_on_rr(cfqq);
 	cfqd->busy_queues++;
-	if (cfq_class_rt(cfqq))
-		cfqd->busy_rt_queues++;
 
 	cfq_resort_rr_list(cfqd, cfqq);
 }
@@ -678,8 +671,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
-	if (cfq_class_rt(cfqq))
-		cfqd->busy_rt_queues--;
 }
 
 /*
@@ -1185,20 +1176,6 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 		goto expire;
 
 	/*
-	 * If we have a RT cfqq waiting, then we pre-empt the current non-rt
-	 * cfqq.
-	 */
-	if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) {
-		/*
-		 * We simulate this as cfqq timed out so that it gets to bank
-		 * the remaining of its time slice.
-		 */
-		cfq_log_cfqq(cfqd, cfqq, "preempt");
-		cfq_slice_expired(cfqd, 1);
-		goto new_queue;
-	}
-
-	/*
 	 * The active queue has requests and isn't expired, allow it to
 	 * dispatch.
 	 */
-- 
cgit v0.10.2


From e3264a4d7de147677f1995f119eba05c9abe9c1c Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 28 Jul 2009 09:13:13 +0200
Subject: Send uevents for write_protect changes

Whenever a block device changes it's read-only attribute
notify the userspace about it.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/genhd.c b/block/genhd.c
index f4c64c2..b89328e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1215,6 +1215,16 @@ void put_disk(struct gendisk *disk)
 
 EXPORT_SYMBOL(put_disk);
 
+static void set_disk_ro_uevent(struct gendisk *gd, int ro)
+{
+	char event[] = "DISK_RO=1";
+	char *envp[] = { event, NULL };
+
+	if (!ro)
+		event[8] = '0';
+	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
+}
+
 void set_device_ro(struct block_device *bdev, int flag)
 {
 	bdev->bd_part->policy = flag;
@@ -1227,8 +1237,12 @@ void set_disk_ro(struct gendisk *disk, int flag)
 	struct disk_part_iter piter;
 	struct hd_struct *part;
 
-	disk_part_iter_init(&piter, disk,
-			    DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0);
+	if (disk->part0.policy != flag) {
+		set_disk_ro_uevent(disk, flag);
+		disk->part0.policy = flag;
+	}
+
+	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
 	while ((part = disk_part_iter_next(&piter)))
 		part->policy = flag;
 	disk_part_iter_exit(&piter);
-- 
cgit v0.10.2


From e7e503aedb1f4d165081cb8d47a58c38f80f0cb4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 28 Jul 2009 10:15:30 +0200
Subject: block: make bio_rw_flagged() return a bool

Makes for a saner interface, instead of returning the bit position.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/include/linux/bio.h b/include/linux/bio.h
index a299ed3..4f8fd02 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -177,7 +177,10 @@ enum bio_rw_flags {
 	BIO_RW_NOIDLE,
 };
 
-#define bio_rw_flagged(bio, flag)	((bio)->bi_rw & (1 << (flag)))
+static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag)
+{
+	return (bio->bi_rw & (1 << flag)) != 0;
+}
 
 /*
  * Old defines, these should eventually be replaced by direct usage of
-- 
cgit v0.10.2


From 1f98a13f623e0ef666690a18c1250335fc6d7ef1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 11 Sep 2009 14:32:04 +0200
Subject: bio: first step in sanitizing the bio->bi_rw flag testing

Get rid of any functions that test for these bits and make callers
use bio_rw_flagged() directly. Then it is at least directly apparent
what variable and flag they check.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index c822239..5255971 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1114,24 +1114,24 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	 * Inherit FAILFAST from bio (for read-ahead, and explicit
 	 * FAILFAST).  FAILFAST flags are identical for req and bio.
 	 */
-	if (bio_rw_ahead(bio))
+	if (bio_rw_flagged(bio, BIO_RW_AHEAD))
 		req->cmd_flags |= REQ_FAILFAST_MASK;
 	else
 		req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK;
 
-	if (unlikely(bio_discard(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) {
 		req->cmd_flags |= REQ_DISCARD;
-		if (bio_barrier(bio))
+		if (bio_rw_flagged(bio, BIO_RW_BARRIER))
 			req->cmd_flags |= REQ_SOFTBARRIER;
 		req->q->prepare_discard_fn(req->q, req);
-	} else if (unlikely(bio_barrier(bio)))
+	} else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)))
 		req->cmd_flags |= REQ_HARDBARRIER;
 
-	if (bio_sync(bio))
+	if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		req->cmd_flags |= REQ_RW_SYNC;
-	if (bio_rw_meta(bio))
+	if (bio_rw_flagged(bio, BIO_RW_META))
 		req->cmd_flags |= REQ_RW_META;
-	if (bio_noidle(bio))
+	if (bio_rw_flagged(bio, BIO_RW_NOIDLE))
 		req->cmd_flags |= REQ_NOIDLE;
 
 	req->errors = 0;
@@ -1155,12 +1155,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	int el_ret;
 	unsigned int bytes = bio->bi_size;
 	const unsigned short prio = bio_prio(bio);
-	const int sync = bio_sync(bio);
-	const int unplug = bio_unplug(bio);
+	const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
 	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 	int rw_flags;
 
-	if (bio_barrier(bio) && bio_has_data(bio) &&
+	if (bio_rw_flagged(bio, BIO_RW_BARRIER) && bio_has_data(bio) &&
 	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
@@ -1174,7 +1174,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely(bio_barrier(bio)) || elv_queue_empty(q))
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -1470,7 +1470,8 @@ static inline void __generic_make_request(struct bio *bio)
 		if (bio_check_eod(bio, nr_sectors))
 			goto end_io;
 
-		if (bio_discard(bio) && !q->prepare_discard_fn) {
+		if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
+		    !q->prepare_discard_fn) {
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ca0d7e7..9e6d0af 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -257,7 +257,7 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
  */
 static inline int cfq_bio_sync(struct bio *bio)
 {
-	if (bio_data_dir(bio) == READ || bio_sync(bio))
+	if (bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		return 1;
 
 	return 0;
diff --git a/block/elevator.c b/block/elevator.c
index ca86192..1975b61 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -79,7 +79,8 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 	/*
 	 * Don't merge file system requests and discard requests
 	 */
-	if (bio_discard(bio) != bio_discard(rq->bio))
+	if (bio_rw_flagged(bio, BIO_RW_DISCARD) !=
+	    bio_rw_flagged(rq->bio, BIO_RW_DISCARD))
 		return 0;
 
 	/*
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5757188..bbb7944 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -475,7 +475,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
 	pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
 
 	if (bio_rw(bio) == WRITE) {
-		int barrier = bio_barrier(bio);
+		bool barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
 		struct file *file = lo->lo_backing_file;
 
 		if (barrier) {
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 33f179e..cc9dc79 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1129,7 +1129,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	if (error == -EOPNOTSUPP)
 		goto out;
 
-	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+	if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
 		goto out;
 
 	if (unlikely(error)) {
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 3e563d2..fde658c 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -285,7 +285,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
 	if (!error)
 		return 0; /* I/O complete */
 
-	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+	if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
 		return error;
 
 	if (error == -EOPNOTSUPP)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b4845b1..ec012f0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -586,7 +586,7 @@ static void dec_pending(struct dm_io *io, int error)
 			 */
 			spin_lock_irqsave(&md->deferred_lock, flags);
 			if (__noflush_suspending(md)) {
-				if (!bio_barrier(io->bio))
+				if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER))
 					bio_list_add_head(&md->deferred,
 							  io->bio);
 			} else
@@ -598,7 +598,7 @@ static void dec_pending(struct dm_io *io, int error)
 		io_error = io->error;
 		bio = io->bio;
 
-		if (bio_barrier(bio)) {
+		if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
 			/*
 			 * There can be just one barrier request so we use
 			 * a per-device variable for error reporting.
@@ -1209,7 +1209,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
 	ci.map = dm_get_table(md);
 	if (unlikely(!ci.map)) {
-		if (!bio_barrier(bio))
+		if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
 			bio_io_error(bio);
 		else
 			if (!md->barrier_error)
@@ -1321,7 +1321,7 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
 	 * we have to queue this io for later.
 	 */
 	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
-	    unlikely(bio_barrier(bio))) {
+	    unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		up_read(&md->io_lock);
 
 		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
@@ -1344,7 +1344,7 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct mapped_device *md = q->queuedata;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
@@ -2164,7 +2164,7 @@ static void dm_wq_work(struct work_struct *work)
 		if (dm_request_based(md))
 			generic_make_request(c);
 		else {
-			if (bio_barrier(c))
+			if (bio_rw_flagged(c, BIO_RW_BARRIER))
 				process_barrier(md, c);
 			else
 				__split_and_process_bio(md, c);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5fe39c2..ea48429 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -288,7 +288,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	sector_t start_sector;
 	int cpu;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 7140909..89e7681 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -90,7 +90,7 @@ static void multipath_end_request(struct bio *bio, int error)
 
 	if (uptodate)
 		multipath_end_bh_io(mp_bh, 0);
-	else if (!bio_rw_ahead(bio)) {
+	else if (!bio_rw_flagged(bio, BIO_RW_AHEAD)) {
 		/*
 		 * oops, IO error:
 		 */
@@ -144,7 +144,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 	const int rw = bio_data_dir(bio);
 	int cpu;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 898e2bd..f845ed9 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -448,7 +448,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
 	const int rw = bio_data_dir(bio);
 	int cpu;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8726fd7..ff7ed33 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -782,8 +782,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	struct bio_list bl;
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
-	const int do_sync = bio_sync(bio);
-	int cpu, do_barriers;
+	const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	int cpu;
+	bool do_barriers;
 	mdk_rdev_t *blocked_rdev;
 
 	/*
@@ -797,7 +798,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 	md_write_start(mddev, bio); /* wait on superblock update early */
 
-	if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
+	if (unlikely(!mddev->barriers_work &&
+		     bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		if (rw == WRITE)
 			md_write_end(mddev);
 		bio_endio(bio, -EOPNOTSUPP);
@@ -925,7 +927,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	do_barriers = bio_barrier(bio);
+	do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER);
 	if (do_barriers)
 		set_bit(R1BIO_Barrier, &r1_bio->state);
 
@@ -1600,7 +1602,7 @@ static void raid1d(mddev_t *mddev)
 			 * We already have a nr_pending reference on these rdevs.
 			 */
 			int i;
-			const int do_sync = bio_sync(r1_bio->master_bio);
+			const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
 			clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
 			clear_bit(R1BIO_Barrier, &r1_bio->state);
 			for (i=0; i < conf->raid_disks; i++)
@@ -1654,7 +1656,7 @@ static void raid1d(mddev_t *mddev)
 				       (unsigned long long)r1_bio->sector);
 				raid_end_bio_io(r1_bio);
 			} else {
-				const int do_sync = bio_sync(r1_bio->master_bio);
+				const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
 				r1_bio->bios[r1_bio->read_disk] =
 					mddev->ro ? IO_BLOCKED : NULL;
 				r1_bio->read_disk = disk;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3d9020c..d0a2152 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -796,12 +796,12 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
-	const int do_sync = bio_sync(bio);
+	const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
 	struct bio_list bl;
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
@@ -1610,7 +1610,7 @@ static void raid10d(mddev_t *mddev)
 				raid_end_bio_io(r10_bio);
 				bio_put(bio);
 			} else {
-				const int do_sync = bio_sync(r10_bio->master_bio);
+				const bool do_sync = bio_rw_flagged(r10_bio->master_bio, BIO_RW_SYNCIO);
 				bio_put(bio);
 				rdev = conf->mirrors[mirror].rdev;
 				if (printk_ratelimit())
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b8a2c5d..826eb34 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3606,7 +3606,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	const int rw = bio_data_dir(bi);
 	int cpu, remaining;
 
-	if (unlikely(bio_barrier(bi))) {
+	if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
 		bio_endio(bi, -EOPNOTSUPP);
 		return 0;
 	}
diff --git a/drivers/staging/dst/dcore.c b/drivers/staging/dst/dcore.c
index fad25b7..b1c258c 100644
--- a/drivers/staging/dst/dcore.c
+++ b/drivers/staging/dst/dcore.c
@@ -112,8 +112,9 @@ static int dst_request(struct request_queue *q, struct bio *bio)
 		 * I worked with.
 		 *
 		 * Empty barriers are not allowed anyway, see 51fd77bd9f512
-		 * for example, although later it was changed to bio_discard()
-		 * only, which does not work in this case.
+		 * for example, although later it was changed to
+		 * bio_rw_flagged(bio, BIO_RW_DISCARD) only, which does not
+		 * work in this case.
 		 */
 		//err = -EOPNOTSUPP;
 		err = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5dbefd1..5cf405b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
 		num_run++;
 		batch_run++;
 
-		if (bio_sync(cur))
+		if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
 			num_sync_run++;
 
 		if (need_resched()) {
@@ -2903,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	bio->bi_rw |= rw;
 
 	spin_lock(&device->io_lock);
-	if (bio_sync(bio))
+	if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		pending_bios = &device->pending_sync_bios;
 	else
 		pending_bios = &device->pending_bios;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4f8fd02..5be93f1 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -177,29 +177,18 @@ enum bio_rw_flags {
 	BIO_RW_NOIDLE,
 };
 
+/*
+ * First four bits must match between bio->bi_rw and rq->cmd_flags, make
+ * that explicit here.
+ */
+#define BIO_RW_RQ_MASK		0xf
+
 static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag)
 {
 	return (bio->bi_rw & (1 << flag)) != 0;
 }
 
 /*
- * Old defines, these should eventually be replaced by direct usage of
- * bio_rw_flagged()
- */
-#define bio_barrier(bio)	bio_rw_flagged(bio, BIO_RW_BARRIER)
-#define bio_sync(bio)		bio_rw_flagged(bio, BIO_RW_SYNCIO)
-#define bio_unplug(bio)		bio_rw_flagged(bio, BIO_RW_UNPLUG)
-#define bio_failfast_dev(bio)	bio_rw_flagged(bio, BIO_RW_FAILFAST_DEV)
-#define bio_failfast_transport(bio)	\
-		bio_rw_flagged(bio, BIO_RW_FAILFAST_TRANSPORT)
-#define bio_failfast_driver(bio) 	\
-		bio_rw_flagged(bio, BIO_RW_FAILFAST_DRIVER)
-#define bio_rw_ahead(bio)	bio_rw_flagged(bio, BIO_RW_AHEAD)
-#define bio_rw_meta(bio)	bio_rw_flagged(bio, BIO_RW_META)
-#define bio_discard(bio)	bio_rw_flagged(bio, BIO_RW_DISCARD)
-#define bio_noidle(bio)		bio_rw_flagged(bio, BIO_RW_NOIDLE)
-
-/*
  * upper 16 bits of bi_rw define the io priority of this bio
  */
 #define BIO_PRIO_SHIFT	(8 * sizeof(unsigned long) - IOPRIO_BITS)
@@ -222,7 +211,7 @@ static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag)
 #define bio_offset(bio)		bio_iovec((bio))->bv_offset
 #define bio_segments(bio)	((bio)->bi_vcnt - (bio)->bi_idx)
 #define bio_sectors(bio)	((bio)->bi_size >> 9)
-#define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
+#define bio_empty_barrier(bio)	(bio_rw_flagged(bio, BIO_RW_BARRIER) && !bio_has_data(bio) && !bio_rw_flagged(bio, BIO_RW_DISCARD))
 
 static inline unsigned int bio_cur_bytes(struct bio *bio)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 650b6a9..88edb62 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -86,7 +86,7 @@ enum {
 };
 
 /*
- * request type modified bits. first two bits match BIO_RW* bits, important
+ * request type modified bits. first four bits match BIO_RW* bits, important
  */
 enum rq_flag_bits {
 	__REQ_RW,		/* not set, read. set, write */
-- 
cgit v0.10.2


From fb1e75389bd06fd5987e9cda1b4e0305c782f854 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 30 Jul 2009 08:18:24 +0200
Subject: block: improve queue_should_plug() by looking at IO depths

Instead of just checking whether this device uses block layer
tagging, we can improve the detection by looking at the maximum
queue depth it has reached. If that crosses 4, then deem it a
queuing device.

This is important on high IOPS devices, since plugging hurts
the performance there (it can be as much as 10-15% of the sys
time).

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 5255971..93051d1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1146,7 +1146,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
  */
 static inline bool queue_should_plug(struct request_queue *q)
 {
-	return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
+	return !(blk_queue_nonrot(q) && blk_queue_queuing(q));
 }
 
 static int __make_request(struct request_queue *q, struct bio *bio)
@@ -1857,8 +1857,15 @@ void blk_dequeue_request(struct request *rq)
 	 * and to it is freed is accounted as io that is in progress at
 	 * the driver side.
 	 */
-	if (blk_account_rq(rq))
+	if (blk_account_rq(rq)) {
 		q->in_flight[rq_is_sync(rq)]++;
+		/*
+		 * Mark this device as supporting hardware queuing, if
+		 * we have more IOs in flight than 4.
+		 */
+		if (!blk_queue_queuing(q) && queue_in_flight(q) > 4)
+			set_bit(QUEUE_FLAG_CQ, &q->queue_flags);
+	}
 }
 
 /**
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 88edb62..98b4563 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -459,6 +459,7 @@ struct request_queue
 #define QUEUE_FLAG_NONROT      14	/* non-rotational device (SSD) */
 #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
 #define QUEUE_FLAG_IO_STAT     15	/* do IO stats */
+#define QUEUE_FLAG_CQ	       16	/* hardware does queuing */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_CLUSTER) |		\
@@ -581,6 +582,7 @@ enum {
 
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
+#define blk_queue_queuing(q)	test_bit(QUEUE_FLAG_CQ, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_nonrot(q)	test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
-- 
cgit v0.10.2


From 5e605b64a183a6c0e84cdb99a6f8acb1f8200437 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 5 Aug 2009 09:07:21 +0200
Subject: block: add blk-iopoll, a NAPI like approach for block devices

This borrows some code from NAPI and implements a polled completion
mode for block devices. The idea is the same as NAPI - instead of
doing the command completion when the irq occurs, schedule a dedicated
softirq in the hopes that we will complete more IO when the iopoll
handler is invoked. Devices have a budget of commands assigned, and will
stay in polled mode as long as they continue to consume their budget
from the iopoll softirq handler. If they do not, the device is set back
to interrupt completion mode.

This patch holds the core bits for blk-iopoll, device driver support
sold separately.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/Makefile b/block/Makefile
index 6c54ed0..ba74ca6 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			ioctl.o genhd.o scsi_ioctl.o
+			blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
new file mode 100644
index 0000000..566db1e
--- /dev/null
+++ b/block/blk-iopoll.c
@@ -0,0 +1,220 @@
+/*
+ * Functions related to interrupt-poll handling in the block layer. This
+ * is similar to NAPI for network devices.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/blk-iopoll.h>
+#include <linux/delay.h>
+
+#include "blk.h"
+
+int blk_iopoll_enabled = 1;
+EXPORT_SYMBOL(blk_iopoll_enabled);
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
+
+/**
+ * blk_iopoll_sched - Schedule a run of the iopoll handler
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     Add this blk_iopoll structure to the pending poll list and trigger the raise
+ *     of the blk iopoll softirq. The driver must already have gotten a succesful
+ *     return from blk_iopoll_sched_prep() before calling this.
+ **/
+void blk_iopoll_sched(struct blk_iopoll *iop)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll));
+	__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_iopoll_sched);
+
+/**
+ * __blk_iopoll_complete - Mark this @iop as un-polled again
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     See blk_iopoll_complete(). This function must be called with interrupts disabled.
+ **/
+void __blk_iopoll_complete(struct blk_iopoll *iop)
+{
+	list_del(&iop->list);
+	smp_mb__before_clear_bit();
+	clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(__blk_iopoll_complete);
+
+/**
+ * blk_iopoll_complete - Mark this @iop as un-polled again
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     If a driver consumes less than the assigned budget in its run of the iopoll
+ *     handler, it'll end the polled mode by calling this function. The iopoll handler
+ *     will not be invoked again before blk_iopoll_sched_prep() is called.
+ **/
+void blk_iopoll_complete(struct blk_iopoll *iopoll)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__blk_iopoll_complete(iopoll);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_iopoll_complete);
+
+static void blk_iopoll_softirq(struct softirq_action *h)
+{
+	struct list_head *list = &__get_cpu_var(blk_cpu_iopoll);
+	unsigned long start_time = jiffies;
+	int rearm = 0, budget = 64;
+
+	local_irq_disable();
+
+	while (!list_empty(list)) {
+		struct blk_iopoll *iop;
+		int work, weight;
+
+		/*
+		 * If softirq window is exhausted then punt.
+		 */
+		if (budget <= 0 || time_after(jiffies, start_time)) {
+			rearm = 1;
+			break;
+		}
+
+		local_irq_enable();
+
+		/* Even though interrupts have been re-enabled, this
+		 * access is safe because interrupts can only add new
+		 * entries to the tail of this list, and only ->poll()
+		 * calls can remove this head entry from the list.
+		 */
+		iop = list_entry(list->next, struct blk_iopoll, list);
+
+		weight = iop->weight;
+		work = 0;
+		if (test_bit(IOPOLL_F_SCHED, &iop->state))
+			work = iop->poll(iop, weight);
+
+		budget -= work;
+
+		local_irq_disable();
+
+		/* Drivers must not modify the NAPI state if they
+		 * consume the entire weight.  In such cases this code
+		 * still "owns" the NAPI instance and therefore can
+		 * move the instance around on the list at-will.
+		 */
+		if (work >= weight) {
+			if (blk_iopoll_disable_pending(iop))
+				__blk_iopoll_complete(iop);
+			else
+				list_move_tail(&iop->list, list);
+		}
+	}
+
+	if (rearm)
+		__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+
+	local_irq_enable();
+}
+
+/**
+ * blk_iopoll_disable - Disable iopoll on this @iop
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     Disable io polling and wait for any pending callbacks to have completed.
+ **/
+void blk_iopoll_disable(struct blk_iopoll *iop)
+{
+	set_bit(IOPOLL_F_DISABLE, &iop->state);
+	while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state))
+		msleep(1);
+	clear_bit(IOPOLL_F_DISABLE, &iop->state);
+}
+EXPORT_SYMBOL(blk_iopoll_disable);
+
+/**
+ * blk_iopoll_enable - Enable iopoll on this @iop
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     Enable iopoll on this @iop. Note that the handler run will not be scheduled, it
+ *     will only mark it as active.
+ **/
+void blk_iopoll_enable(struct blk_iopoll *iop)
+{
+	BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
+        smp_mb__before_clear_bit();
+	clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(blk_iopoll_enable);
+
+/**
+ * blk_iopoll_init - Initialize this @iop
+ * @iop:      The parent iopoll structure
+ * @weight:   The default weight (or command completion budget)
+ * @poll_fn:  The handler to invoke
+ *
+ * Description:
+ *     Initialize this blk_iopoll structure. Before being actively used, the driver
+ *     must call blk_iopoll_enable().
+ **/
+void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
+{
+	memset(iop, 0, sizeof(*iop));
+	INIT_LIST_HEAD(&iop->list);
+	iop->weight = weight;
+	iop->poll = poll_fn;
+	set_bit(IOPOLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(blk_iopoll_init);
+
+static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self,
+					  unsigned long action, void *hcpu)
+{
+	/*
+	 * If a CPU goes away, splice its entries to the current CPU
+	 * and trigger a run of the softirq
+	 */
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+		int cpu = (unsigned long) hcpu;
+
+		local_irq_disable();
+		list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
+				 &__get_cpu_var(blk_cpu_iopoll));
+		raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+		local_irq_enable();
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata blk_iopoll_cpu_notifier = {
+	.notifier_call	= blk_iopoll_cpu_notify,
+};
+
+static __init int blk_iopoll_setup(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
+
+	open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq);
+	register_hotcpu_notifier(&blk_iopoll_cpu_notifier);
+	return 0;
+}
+subsys_initcall(blk_iopoll_setup);
diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h
new file mode 100644
index 0000000..b2e1739
--- /dev/null
+++ b/include/linux/blk-iopoll.h
@@ -0,0 +1,41 @@
+#ifndef BLK_IOPOLL_H
+#define BLK_IOPOLL_H
+
+struct blk_iopoll;
+typedef int (blk_iopoll_fn)(struct blk_iopoll *, int);
+
+struct blk_iopoll {
+	struct list_head list;
+	unsigned long state;
+	unsigned long data;
+	int weight;
+	int max;
+	blk_iopoll_fn *poll;
+};
+
+enum {
+	IOPOLL_F_SCHED		= 0,
+	IOPOLL_F_DISABLE	= 1,
+};
+
+static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop)
+{
+	return !test_bit(IOPOLL_F_DISABLE, &iop->state) &&
+		!test_and_set_bit(IOPOLL_F_SCHED, &iop->state);
+}
+
+static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop)
+{
+	return test_bit(IOPOLL_F_DISABLE, &iop->state);
+}
+
+extern void blk_iopoll_sched(struct blk_iopoll *);
+extern void blk_iopoll_init(struct blk_iopoll *, int, blk_iopoll_fn *);
+extern void blk_iopoll_complete(struct blk_iopoll *);
+extern void __blk_iopoll_complete(struct blk_iopoll *);
+extern void blk_iopoll_enable(struct blk_iopoll *);
+extern void blk_iopoll_disable(struct blk_iopoll *);
+
+extern int blk_iopoll_enabled;
+
+#endif
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 35e7df1..edd8d5c 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -344,6 +344,7 @@ enum
 	NET_TX_SOFTIRQ,
 	NET_RX_SOFTIRQ,
 	BLOCK_SOFTIRQ,
+	BLOCK_IOPOLL_SOFTIRQ,
 	TASKLET_SOFTIRQ,
 	SCHED_SOFTIRQ,
 	HRTIMER_SOFTIRQ,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 58be760..0ed9fa6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -92,6 +92,7 @@ extern int sysctl_nr_trim_pages;
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
+extern int blk_iopoll_enabled;
 
 /* Constants used for minimum and  maximum */
 #ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -990,7 +991,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "blk_iopoll",
+		.data		= &blk_iopoll_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
cgit v0.10.2


From 1badcfbd7febd33f71fc02bda9112bd25e9c294b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 6 Aug 2009 20:49:14 +0200
Subject: block: fix long lines in block/blk-iopoll.c

Note sure why they happened in the first place, probably some bad
terminal setting.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index 566db1e..df6f192 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -24,9 +24,9 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
  * @iop:      The parent iopoll structure
  *
  * Description:
- *     Add this blk_iopoll structure to the pending poll list and trigger the raise
- *     of the blk iopoll softirq. The driver must already have gotten a succesful
- *     return from blk_iopoll_sched_prep() before calling this.
+ *     Add this blk_iopoll structure to the pending poll list and trigger the
+ *     raise of the blk iopoll softirq. The driver must already have gotten a
+ *     succesful return from blk_iopoll_sched_prep() before calling this.
  **/
 void blk_iopoll_sched(struct blk_iopoll *iop)
 {
@@ -44,7 +44,8 @@ EXPORT_SYMBOL(blk_iopoll_sched);
  * @iop:      The parent iopoll structure
  *
  * Description:
- *     See blk_iopoll_complete(). This function must be called with interrupts disabled.
+ *     See blk_iopoll_complete(). This function must be called with interrupts
+ *     disabled.
  **/
 void __blk_iopoll_complete(struct blk_iopoll *iop)
 {
@@ -59,9 +60,10 @@ EXPORT_SYMBOL(__blk_iopoll_complete);
  * @iop:      The parent iopoll structure
  *
  * Description:
- *     If a driver consumes less than the assigned budget in its run of the iopoll
- *     handler, it'll end the polled mode by calling this function. The iopoll handler
- *     will not be invoked again before blk_iopoll_sched_prep() is called.
+ *     If a driver consumes less than the assigned budget in its run of the
+ *     iopoll handler, it'll end the polled mode by calling this function. The
+ *     iopoll handler will not be invoked again before blk_iopoll_sched_prep()
+ *     is called.
  **/
 void blk_iopoll_complete(struct blk_iopoll *iopoll)
 {
@@ -151,13 +153,13 @@ EXPORT_SYMBOL(blk_iopoll_disable);
  * @iop:      The parent iopoll structure
  *
  * Description:
- *     Enable iopoll on this @iop. Note that the handler run will not be scheduled, it
- *     will only mark it as active.
+ *     Enable iopoll on this @iop. Note that the handler run will not be
+ *     scheduled, it will only mark it as active.
  **/
 void blk_iopoll_enable(struct blk_iopoll *iop)
 {
 	BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
-        smp_mb__before_clear_bit();
+	smp_mb__before_clear_bit();
 	clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
 }
 EXPORT_SYMBOL(blk_iopoll_enable);
@@ -169,8 +171,8 @@ EXPORT_SYMBOL(blk_iopoll_enable);
  * @poll_fn:  The handler to invoke
  *
  * Description:
- *     Initialize this blk_iopoll structure. Before being actively used, the driver
- *     must call blk_iopoll_enable().
+ *     Initialize this blk_iopoll structure. Before being actively used, the
+ *     driver must call blk_iopoll_enable().
  **/
 void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
 {
-- 
cgit v0.10.2


From 37867ae7c549c58d44337e27738577ed8b7cd753 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 6 Aug 2009 20:50:48 +0200
Subject: block: adjust default budget for blk-iopoll

It's not exported, I doubt we'll have a reason to change this...

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index df6f192..0671d46 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -17,6 +17,8 @@
 int blk_iopoll_enabled = 1;
 EXPORT_SYMBOL(blk_iopoll_enabled);
 
+static unsigned int blk_iopoll_budget __read_mostly = 256;
+
 static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
 
 /**
@@ -78,8 +80,8 @@ EXPORT_SYMBOL(blk_iopoll_complete);
 static void blk_iopoll_softirq(struct softirq_action *h)
 {
 	struct list_head *list = &__get_cpu_var(blk_cpu_iopoll);
+	int rearm = 0, budget = blk_iopoll_budget;
 	unsigned long start_time = jiffies;
-	int rearm = 0, budget = 64;
 
 	local_irq_disable();
 
-- 
cgit v0.10.2


From fca51d64c5baf64604bc1edc5c5f0e7ed176322f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 6 Aug 2009 20:53:23 +0200
Subject: block: fix comment in blk-iopoll.c

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index 0671d46..b9b3265 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -115,9 +115,12 @@ static void blk_iopoll_softirq(struct softirq_action *h)
 
 		local_irq_disable();
 
-		/* Drivers must not modify the NAPI state if they
-		 * consume the entire weight.  In such cases this code
-		 * still "owns" the NAPI instance and therefore can
+		/*
+		 * Drivers must not modify the iopoll state, if they
+		 * consume their assigned weight (or more, some drivers can't
+		 * easily just stop processing, they have to complete an
+		 * entire mask of commands).In such cases this code
+		 * still "owns" the iopoll instance and therefore can
 		 * move the instance around on the list at-will.
 		 */
 		if (work >= weight) {
-- 
cgit v0.10.2


From a33dac26d42d6f156b3b05a929961bd8d904f6e2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 6 Aug 2009 20:53:45 +0200
Subject: block: use interrupts disabled version of raise_softirq_irqoff()

We already have interrupts disabled at that point, so use the
__raise_softirq_irqoff() variant.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index b9b3265..ca56420 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -202,7 +202,7 @@ static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self,
 		local_irq_disable();
 		list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
 				 &__get_cpu_var(blk_cpu_iopoll));
-		raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+		__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
 		local_irq_enable();
 	}
 
-- 
cgit v0.10.2


From 1b379d8daf4e981b2220f057683e35af022d45bc Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 11 Aug 2009 08:26:11 +0200
Subject: cfq-iosched: get rid of must_alloc flag

It's not currently used, as pointed out by
Gui Jianfeng <guijianfeng@cn.fujitsu.com>. We already check the
wait_request flag to allow an idling queue priority allocation access,
so we don't need this extra flag.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9e6d0af..f75a54b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -186,7 +186,6 @@ enum cfqq_state_flags {
 	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
 	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
 	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
-	CFQ_CFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */
 	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
 	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
 	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
@@ -213,7 +212,6 @@ static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
 CFQ_CFQQ_FNS(on_rr);
 CFQ_CFQQ_FNS(wait_request);
 CFQ_CFQQ_FNS(must_dispatch);
-CFQ_CFQQ_FNS(must_alloc);
 CFQ_CFQQ_FNS(must_alloc_slice);
 CFQ_CFQQ_FNS(fifo_expire);
 CFQ_CFQQ_FNS(idle_window);
@@ -2218,8 +2216,7 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
 
 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
 {
-	if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
-	    !cfq_cfqq_must_alloc_slice(cfqq)) {
+	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
 		cfq_mark_cfqq_must_alloc_slice(cfqq);
 		return ELV_MQUEUE_MUST;
 	}
@@ -2306,7 +2303,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 	}
 
 	cfqq->allocated[rw]++;
-	cfq_clear_cfqq_must_alloc(cfqq);
 	atomic_inc(&cfqq->ref);
 
 	spin_unlock_irqrestore(q->queue_lock, flags);
-- 
cgit v0.10.2


From d62f843b295393124970d29316344150c7de009b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 11 Aug 2009 08:31:14 +0200
Subject: block: make blk_iopoll_prep_sched() follow normal 0/1 return
 convention

Return 0 if we successfully marked this iopoll structure as ours for
scheduling, instead of 1.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h
index b2e1739..308734d 100644
--- a/include/linux/blk-iopoll.h
+++ b/include/linux/blk-iopoll.h
@@ -18,10 +18,17 @@ enum {
 	IOPOLL_F_DISABLE	= 1,
 };
 
+/*
+ * Returns 0 if we successfully set the IOPOLL_F_SCHED bit, indicating
+ * that we were the first to acquire this iop for scheduling. If this iop
+ * is currently disabled, return "failure".
+ */
 static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop)
 {
-	return !test_bit(IOPOLL_F_DISABLE, &iop->state) &&
-		!test_and_set_bit(IOPOLL_F_SCHED, &iop->state);
+	if (!test_bit(IOPOLL_F_DISABLE, &iop->state))
+		return test_and_set_bit(IOPOLL_F_SCHED, &iop->state);
+
+	return 1;
 }
 
 static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop)
-- 
cgit v0.10.2


From 723590ed52d244b025f10a8e2f54c2f57ed02b4d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Sat, 15 Aug 2009 08:43:22 +0200
Subject: splice: update mtime and atime on files

Splice should update the modification and access times on regular
files just like read and write. Not updating mtime will confuse
backup tools, etc...

This patch only adds the time updates for regular files.  For pipes
and other special files that splice touches the need for updating the
times is less clear.  Let's discuss and fix that separately.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/splice.c b/fs/splice.c
index 73766d2..aec4014 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 		len = left;
 
 	ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
-	if (ret > 0)
+	if (ret > 0) {
 		*ppos += ret;
+		file_accessed(in);
+	}
 
 	return ret;
 }
@@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
 		ret = file_remove_suid(out);
-		if (!ret)
+		if (!ret) {
+			file_update_time(out);
 			ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
+		}
 		mutex_unlock(&inode->i_mutex);
 	} while (ret > 0);
 	splice_from_pipe_end(pipe, &sd);
-- 
cgit v0.10.2


From 212a502676c308ead964a024e081529ad3f8d94c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 24 Aug 2009 10:01:53 +0200
Subject: cciss: memory leak in cciss_init_one()

commit 22bece00dc1f28dd3374c55e464c9f02eb642876
(cciss: fix regression firmware not displayed in procfs)
added a small memory leak in cciss_init_one()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index a52cc7f..0589dfb 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -3889,7 +3889,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	int j = 0;
 	int rc;
 	int dac, return_code;
-	InquiryData_struct *inq_buff = NULL;
+	InquiryData_struct *inq_buff;
 
 	if (reset_devices) {
 		/* Reset the controller with a PCI power-cycle */
@@ -4029,6 +4029,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 		printk(KERN_WARNING "cciss: unable to determine firmware"
 			" version of controller\n");
 	}
+	kfree(inq_buff);
 
 	cciss_procinit(i);
 
@@ -4045,7 +4046,6 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	return 1;
 
 clean4:
-	kfree(inq_buff);
 	kfree(hba[i]->cmd_pool_bits);
 	if (hba[i]->cmd_pool)
 		pci_free_consistent(hba[i]->pdev,
-- 
cgit v0.10.2


From 49b3a3cbc0311f8f809183696b1f2ab824b18a51 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Mon, 24 Aug 2009 10:56:38 +0200
Subject: block: use printk_once

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Tim Waugh <tim@cyberelk.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 911dfd9..9f3518c 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -219,8 +219,6 @@ static int pcd_sector;		/* address of next requested sector */
 static int pcd_count;		/* number of blocks still to do */
 static char *pcd_buf;		/* buffer for request in progress */
 
-static int pcd_warned;		/* Have we logged a phase warning ? */
-
 /* kernel glue structures */
 
 static int pcd_block_open(struct block_device *bdev, fmode_t mode)
@@ -417,12 +415,10 @@ static int pcd_completion(struct pcd_unit *cd, char *buf, char *fun)
 					printk
 					    ("%s: %s: Unexpected phase %d, d=%d, k=%d\n",
 					     cd->name, fun, p, d, k);
-				if ((verbose < 2) && !pcd_warned) {
-					pcd_warned = 1;
-					printk
-					    ("%s: WARNING: ATAPI phase errors\n",
-					     cd->name);
-				}
+				if (verbose < 2)
+					printk_once(
+					    "%s: WARNING: ATAPI phase errors\n",
+					    cd->name);
 				mdelay(1);
 			}
 			if (k++ > PCD_TMO) {
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index da403b6..f5cd2e8 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -1564,15 +1564,13 @@ static int carm_init_shm(struct carm_host *host)
 
 static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 {
-	static unsigned int printed_version;
 	struct carm_host *host;
 	unsigned int pci_dac;
 	int rc;
 	struct request_queue *q;
 	unsigned int i;
 
-	if (!printed_version++)
-		printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+	printk_once(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
 
 	rc = pci_enable_device(pdev);
 	if (rc)
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index 390d69b..b441ce3 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -416,15 +416,9 @@ retry:
 		goto retry;
 	}
 	if (we.max_disk > (MAX_DISKNO - 1)) {
-		static int warned;
-
-		if (warned == 0) {
-			warned++;
-			printk(VIOD_KERN_INFO
-				"Only examining the first %d "
-				"of %d disks connected\n",
-				MAX_DISKNO, we.max_disk + 1);
-		}
+		printk_once(VIOD_KERN_INFO
+			"Only examining the first %d of %d disks connected\n",
+			MAX_DISKNO, we.max_disk + 1);
 	}
 
 	/* Send the close event to OS/400.  We DON'T expect a response */
-- 
cgit v0.10.2


From b217a903ab6581cba04f88c44284dcdd2a752561 Mon Sep 17 00:00:00 2001
From: Shan Wei <shanwei@cn.fujitsu.com>
Date: Tue, 1 Sep 2009 10:06:42 +0200
Subject: cfq: fix the log message after dispatched a request

The blktrace tools can show process id when cfq dispatched a request,
using cfq_log_cfqq() instead of cfq_log().

Signed-off-by: Shan Wei <shanwei@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f75a54b..a34686f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1349,7 +1349,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 		cfq_slice_expired(cfqd, 0);
 	}
 
-	cfq_log(cfqd, "dispatched a request");
+	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
 	return 1;
 }
 
-- 
cgit v0.10.2


From 01e97f6b897bf06ec83375d691f2f4d57f5b3a09 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 3 Sep 2009 20:06:47 +0200
Subject: block: enable rq CPU completion affinity by default

Test results here look good, and on big OLTP runs it has also shown
to significantly increase cycles attributed to the database and
cause a performance boost.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 98b4563..8bf1a10 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -463,7 +463,8 @@ struct request_queue
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_CLUSTER) |		\
-				 (1 << QUEUE_FLAG_STACKABLE))
+				 (1 << QUEUE_FLAG_STACKABLE)	|	\
+				 (1 << QUEUE_FLAG_SAME_COMP))
 
 static inline int queue_is_locked(struct request_queue *q)
 {
-- 
cgit v0.10.2


From 01edede41e352e4879a89cdc5468f72ffc89b713 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Tue, 8 Sep 2009 21:56:38 +0200
Subject: block: trace bio queueing trial only when it occurs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If BIO is discarded or cross over end of device,
BIO queueing trial doesn't occur.

Actually the trace was called just before make_request at first:
[PATCH] Block queue IO tracing support (blktrace) as of 2006-03-23
      2056a782f8e7e65fd4bfd027506b4ce1c5e9ccd4

And then 2 patches added some checks between them:
[PATCH] md: check bio address after mapping through partitions
        5ddfe9691c91a244e8d1be597b6428fcefd58103,
[BLOCK] Don't allow empty barriers to be passed down to
queues that don't grok them
        51fd77bd9f512ab6cc9df0733ba1caaab89eb957

It breaks original goal.
Let's trace it only when it happens.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 93051d1..982d634 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1462,8 +1462,6 @@ static inline void __generic_make_request(struct bio *bio)
 		if (old_sector != -1)
 			trace_block_remap(q, bio, old_dev, old_sector);
 
-		trace_block_bio_queue(q, bio);
-
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
 
@@ -1476,6 +1474,8 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 		}
 
+		trace_block_bio_queue(q, bio);
+
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
 
-- 
cgit v0.10.2


From 18d8217bc441630c3c5ec7416c5a65c69e8a0979 Mon Sep 17 00:00:00 2001
From: Ed Cashin <ecashin@coraid.com>
Date: Thu, 10 Sep 2009 22:30:47 +0200
Subject: aoe: end barrier bios with EOPNOTSUPP

BugLink: http://bugzilla.kernel.org/show_bug.cgi?id=13942

Bruno Premont noticed that aoe throws a BUG during umount of an XFS in
2.6.31:

[ 5259.349897] aoe: bi_io_vec is NULL
[ 5259.349940] ------------[ cut here ]------------
[ 5259.349958] kernel BUG at /usr/src/linux-2.6/drivers/block/aoe/aoeblk.c:177!
[ 5259.349990] invalid opcode: 0000 [#1]

The bio in question is a barrier.  Jens Axboe suggested that such bios
need to be recognized and ended with -EOPNOTSUPP by any driver that
provides its own ->make_request_fn handler and does not handle
barriers.

In testing the changes below eliminate the BUG.

(Better would be real barrier support, something that Ed says he'll add
for later in the .32 cycle. For now, this at least gets rid of a bug
with crashing on an empty barrier. Jens)

Signed-off-by: Ed L. Cashin <ecashin@coraid.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 1e15889..bf7ceba 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -172,6 +172,9 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
 		BUG();
 		bio_endio(bio, -ENXIO);
 		return 0;
+	} else if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
+		bio_endio(bio, -EOPNOTSUPP);
+		return 0;
 	} else if (bio->bi_io_vec == NULL) {
 		printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
 		BUG();
-- 
cgit v0.10.2


From a9327cac440be4d8333bba975cbbf76045096275 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Fri, 11 Sep 2009 09:18:54 +0200
Subject: Seperate read and write statistics of in_flight requests

Currently, there is a single in_flight counter measuring the number of
requests in the request_queue. But some monitoring tools would like to
know how many read requests and write requests are in progress. Split the
current in_flight counter into two seperate counters for read and write.

This information is exported as a sysfs attribute, as changing the
currently available stat files would break the existing tools.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 982d634..182ebe3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -69,7 +69,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		part_stat_inc(cpu, part, merges[rw]);
 	else {
 		part_round_stats(cpu, part);
-		part_inc_in_flight(part);
+		part_inc_in_flight(part, rw);
 	}
 
 	part_stat_unlock();
@@ -1030,7 +1030,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part,
 
 	if (part->in_flight) {
 		__part_stat_add(cpu, part, time_in_queue,
-				part->in_flight * (now - part->stamp));
+				part_in_flight(part) * (now - part->stamp));
 		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
 	}
 	part->stamp = now;
@@ -1737,7 +1737,7 @@ static void blk_account_io_done(struct request *req)
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part);
+		part_dec_in_flight(part, rw);
 
 		part_stat_unlock();
 	}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index b0de857..99cb5cf 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
 		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part);
+		part_dec_in_flight(part, rq_data_dir(req));
 
 		part_stat_unlock();
 	}
diff --git a/block/genhd.c b/block/genhd.c
index b89328e..5b76bf5 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -869,6 +869,7 @@ static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -888,6 +889,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
+	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
@@ -1053,7 +1055,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   part_stat_read(hd, merges[1]),
 			   (unsigned long long)part_stat_read(hd, sectors[1]),
 			   jiffies_to_msecs(part_stat_read(hd, ticks[1])),
-			   hd->in_flight,
+			   part_in_flight(hd),
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
 			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
 			);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ec012f0..eee28fa 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -130,7 +130,7 @@ struct mapped_device {
 	/*
 	 * A list of ios that arrived while we were suspended.
 	 */
-	atomic_t pending;
+	atomic_t pending[2];
 	wait_queue_head_t wait;
 	struct work_struct work;
 	struct bio_list deferred;
@@ -453,13 +453,14 @@ static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
 	int cpu;
+	int rw = bio_data_dir(io->bio);
 
 	io->start_time = jiffies;
 
 	cpu = part_stat_lock();
 	part_round_stats(cpu, &dm_disk(md)->part0);
 	part_stat_unlock();
-	dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending);
+	dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
 }
 
 static void end_io_acct(struct dm_io *io)
@@ -479,8 +480,9 @@ static void end_io_acct(struct dm_io *io)
 	 * After this is decremented the bio must not be touched if it is
 	 * a barrier.
 	 */
-	dm_disk(md)->part0.in_flight = pending =
-		atomic_dec_return(&md->pending);
+	dm_disk(md)->part0.in_flight[rw] = pending =
+		atomic_dec_return(&md->pending[rw]);
+	pending += atomic_read(&md->pending[rw^0x1]);
 
 	/* nudge anyone waiting on suspend queue */
 	if (!pending)
@@ -1785,7 +1787,8 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->disk)
 		goto bad_disk;
 
-	atomic_set(&md->pending, 0);
+	atomic_set(&md->pending[0], 0);
+	atomic_set(&md->pending[1], 0);
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
 	init_waitqueue_head(&md->eventq);
@@ -2088,7 +2091,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 				break;
 			}
 			spin_unlock_irqrestore(q->queue_lock, flags);
-		} else if (!atomic_read(&md->pending))
+		} else if (!atomic_read(&md->pending[0]) &&
+					!atomic_read(&md->pending[1]))
 			break;
 
 		if (interruptible == TASK_INTERRUPTIBLE &&
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ea4e6cb..619ba99 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev,
 		part_stat_read(p, merges[WRITE]),
 		(unsigned long long)part_stat_read(p, sectors[WRITE]),
 		jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
-		p->in_flight,
+		part_in_flight(p),
 		jiffies_to_msecs(part_stat_read(p, io_ticks)),
 		jiffies_to_msecs(part_stat_read(p, time_in_queue)));
 }
 
+ssize_t part_inflight_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+
+	return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
+}
+
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 ssize_t part_fail_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
@@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_stat.attr,
+	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 45fc320..44263cb 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -98,7 +98,7 @@ struct hd_struct {
 	int make_it_fail;
 #endif
 	unsigned long stamp;
-	int in_flight;
+	int in_flight[2];
 #ifdef	CONFIG_SMP
 	struct disk_stats *dkstats;
 #else
@@ -322,18 +322,23 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_sub(cpu, gendiskp, field, subnd)			\
 	part_stat_add(cpu, gendiskp, field, -subnd)
 
-static inline void part_inc_in_flight(struct hd_struct *part)
+static inline void part_inc_in_flight(struct hd_struct *part, int rw)
 {
-	part->in_flight++;
+	part->in_flight[rw]++;
 	if (part->partno)
-		part_to_disk(part)->part0.in_flight++;
+		part_to_disk(part)->part0.in_flight[rw]++;
 }
 
-static inline void part_dec_in_flight(struct hd_struct *part)
+static inline void part_dec_in_flight(struct hd_struct *part, int rw)
 {
-	part->in_flight--;
+	part->in_flight[rw]--;
 	if (part->partno)
-		part_to_disk(part)->part0.in_flight--;
+		part_to_disk(part)->part0.in_flight[rw]--;
+}
+
+static inline int part_in_flight(struct hd_struct *part)
+{
+	return part->in_flight[0] + part->in_flight[1];
 }
 
 /* block/blk-core.c */
@@ -546,6 +551,8 @@ extern ssize_t part_size_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
 extern ssize_t part_stat_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
+extern ssize_t part_inflight_show(struct device *dev,
+			      struct device_attribute *attr, char *buf);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 extern ssize_t part_fail_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
-- 
cgit v0.10.2


From 06d2188644c85c56d243efab914f368d1d23c4a3 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Fri, 11 Sep 2009 17:08:59 +0200
Subject: cfq: choose a new next_req when a request is dispatched

This patch addresses http://bugzilla.kernel.org/show_bug.cgi?id=13401, a
regression introduced in 2.6.30.

From the bug report:

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a34686f..0e3814b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1110,6 +1110,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 
 	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
 
+	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
 	cfq_remove_request(rq);
 	cfqq->dispatched++;
 	elv_dispatch_sort(q, rq);
-- 
cgit v0.10.2


From 3c5820c743479285ce2678fd3c12b1fd39fe998f Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 11 Sep 2009 21:54:52 +0200
Subject: block: Optimal I/O limit wrapper

Implement blk_limits_io_opt() and make blk_queue_io_opt() a wrapper
around it. DM needs this to avoid poking at the queue_limits directly.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 476d870..83413ff 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -428,6 +428,25 @@ void blk_queue_io_min(struct request_queue *q, unsigned int min)
 EXPORT_SYMBOL(blk_queue_io_min);
 
 /**
+ * blk_limits_io_opt - set optimal request size for a device
+ * @limits: the queue limits
+ * @opt:  smallest I/O size in bytes
+ *
+ * Description:
+ *   Storage devices may report an optimal I/O size, which is the
+ *   device's preferred unit for sustained I/O.  This is rarely reported
+ *   for disk drives.  For RAID arrays it is usually the stripe width or
+ *   the internal track size.  A properly aligned multiple of
+ *   optimal_io_size is the preferred request size for workloads where
+ *   sustained throughput is desired.
+ */
+void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt)
+{
+	limits->io_opt = opt;
+}
+EXPORT_SYMBOL(blk_limits_io_opt);
+
+/**
  * blk_queue_io_opt - set optimal request size for the queue
  * @q:	the request queue for the device
  * @opt:  optimal request size in bytes
@@ -442,7 +461,7 @@ EXPORT_SYMBOL(blk_queue_io_min);
  */
 void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
 {
-	q->limits.io_opt = opt;
+	blk_limits_io_opt(&q->limits, opt);
 }
 EXPORT_SYMBOL(blk_queue_io_opt);
 
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index fde658c..e0efc1a 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -336,7 +336,7 @@ static void stripe_io_hints(struct dm_target *ti,
 	unsigned chunk_size = (sc->chunk_mask + 1) << 9;
 
 	blk_limits_io_min(limits, chunk_size);
-	limits->io_opt = chunk_size * sc->stripes;
+	blk_limits_io_opt(limits, chunk_size * sc->stripes);
 }
 
 static struct target_type stripe_target = {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8bf1a10..86c2bdf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -935,6 +935,7 @@ extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
+extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
 extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
 extern void blk_set_default_limits(struct queue_limits *lim);
 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
-- 
cgit v0.10.2


From b8a9ae779f2c7049071034661e09cb7e1e82250c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 11 Sep 2009 22:44:29 +0200
Subject: block: don't assume device has a request list backing in nr_requests
 store

Stacked devices do not. For now, just error out with -EINVAL. Later
we could make the limit apply on stacked devices too, for throttling
reasons.

This fixes

5a54cd13353bb3b88887604e2c980aa01e314309

and should go into 2.6.31 stable as well.

Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d3aa2aa..b78c9c3 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -40,7 +40,12 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 {
 	struct request_list *rl = &q->rq;
 	unsigned long nr;
-	int ret = queue_var_store(&nr, page, count);
+	int ret;
+
+	if (!q->request_fn)
+		return -EINVAL;
+
+	ret = queue_var_store(&nr, page, count);
 	if (nr < BLKDEV_MIN_RQ)
 		nr = BLKDEV_MIN_RQ;
 
-- 
cgit v0.10.2


From 3d2257f157c2324acbc0fa0fa54e8626a987edd2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 12 Sep 2009 07:35:37 +0200
Subject: Make DISCARD_BARRIER and DISCARD_NOBARRIER writes instead of reads

The commands are conceptually writes, and in the case of IDE and SCSI
commands actually are writes.  They were only reads because we thought
that would interact better with the elevators.  Now the elevators know
about discard requests, that advantage no longer exists.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c1f9935..7f5a8ad2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -161,8 +161,8 @@ struct inodes_stat_t {
  * These aren't really reads or writes, they pass down information about
  * parts of device that are now unused by the file system.
  */
-#define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
-#define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
+#define DISCARD_NOBARRIER (WRITE | (1 << BIO_RW_DISCARD))
+#define DISCARD_BARRIER (DISCARD_NOBARRIER | (1 << BIO_RW_BARRIER))
 
 #define SEL_IN		1
 #define SEL_OUT		2
-- 
cgit v0.10.2


From 746cd1e7e4a555ddaee53b19a46e05c9c61eaf09 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 12 Sep 2009 07:35:43 +0200
Subject: block: use blkdev_issue_discard in blk_ioctl_discard

blk_ioctl_discard duplicates large amounts of code from blkdev_issue_discard,
the only difference between the two is that blkdev_issue_discard needs to
send a barrier discard request and blk_ioctl_discard a non-barrier one,
and blk_ioctl_discard needs to wait on the request.  To facilitates this
add a flags argument to blkdev_issue_discard to control both aspects of the
behaviour.  This will be very useful later on for using the waiting
funcitonality for other callers.

Based on an earlier patch from Matthew Wilcox <matthew@wil.cx>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 30022b4..6593ab3 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -348,6 +348,9 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	}
 
+	if (bio->bi_private)
+		complete(bio->bi_private);
+
 	bio_put(bio);
 }
 
@@ -357,21 +360,20 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
  * @sector:	start sector
  * @nr_sects:	number of sectors to discard
  * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @flags:	DISCARD_FL_* flags to control behaviour
  *
  * Description:
- *    Issue a discard request for the sectors in question. Does not wait.
+ *    Issue a discard request for the sectors in question.
  */
-int blkdev_issue_discard(struct block_device *bdev,
-			 sector_t sector, sector_t nr_sects, gfp_t gfp_mask)
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, int flags)
 {
-	struct request_queue *q;
-	struct bio *bio;
+	DECLARE_COMPLETION_ONSTACK(wait);
+	struct request_queue *q = bdev_get_queue(bdev);
+	int type = flags & DISCARD_FL_BARRIER ?
+		DISCARD_BARRIER : DISCARD_NOBARRIER;
 	int ret = 0;
 
-	if (bdev->bd_disk == NULL)
-		return -ENXIO;
-
-	q = bdev_get_queue(bdev);
 	if (!q)
 		return -ENXIO;
 
@@ -379,12 +381,14 @@ int blkdev_issue_discard(struct block_device *bdev,
 		return -EOPNOTSUPP;
 
 	while (nr_sects && !ret) {
-		bio = bio_alloc(gfp_mask, 0);
+		struct bio *bio = bio_alloc(gfp_mask, 0);
 		if (!bio)
 			return -ENOMEM;
 
 		bio->bi_end_io = blkdev_discard_end_io;
 		bio->bi_bdev = bdev;
+		if (flags & DISCARD_FL_WAIT)
+			bio->bi_private = &wait;
 
 		bio->bi_sector = sector;
 
@@ -396,10 +400,13 @@ int blkdev_issue_discard(struct block_device *bdev,
 			bio->bi_size = nr_sects << 9;
 			nr_sects = 0;
 		}
+
 		bio_get(bio);
-		submit_bio(DISCARD_BARRIER, bio);
+		submit_bio(type, bio);
+
+		if (flags & DISCARD_FL_WAIT)
+			wait_for_completion(&wait);
 
-		/* Check if it failed immediately */
 		if (bio_flagged(bio, BIO_EOPNOTSUPP))
 			ret = -EOPNOTSUPP;
 		else if (!bio_flagged(bio, BIO_UPTODATE))
diff --git a/block/ioctl.c b/block/ioctl.c
index 500e4c7..d3e6b58 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -112,22 +112,9 @@ static int blkdev_reread_part(struct block_device *bdev)
 	return res;
 }
 
-static void blk_ioc_discard_endio(struct bio *bio, int err)
-{
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	}
-	complete(bio->bi_private);
-}
-
 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			     uint64_t len)
 {
-	struct request_queue *q = bdev_get_queue(bdev);
-	int ret = 0;
-
 	if (start & 511)
 		return -EINVAL;
 	if (len & 511)
@@ -137,40 +124,8 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 
 	if (start + len > (bdev->bd_inode->i_size >> 9))
 		return -EINVAL;
-
-	if (!q->prepare_discard_fn)
-		return -EOPNOTSUPP;
-
-	while (len && !ret) {
-		DECLARE_COMPLETION_ONSTACK(wait);
-		struct bio *bio;
-
-		bio = bio_alloc(GFP_KERNEL, 0);
-
-		bio->bi_end_io = blk_ioc_discard_endio;
-		bio->bi_bdev = bdev;
-		bio->bi_private = &wait;
-		bio->bi_sector = start;
-
-		if (len > queue_max_hw_sectors(q)) {
-			bio->bi_size = queue_max_hw_sectors(q) << 9;
-			len -= queue_max_hw_sectors(q);
-			start += queue_max_hw_sectors(q);
-		} else {
-			bio->bi_size = len << 9;
-			len = 0;
-		}
-		submit_bio(DISCARD_NOBARRIER, bio);
-
-		wait_for_completion(&wait);
-
-		if (bio_flagged(bio, BIO_EOPNOTSUPP))
-			ret = -EOPNOTSUPP;
-		else if (!bio_flagged(bio, BIO_UPTODATE))
-			ret = -EIO;
-		bio_put(bio);
-	}
-	return ret;
+	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
+				    DISCARD_FL_WAIT);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72a2b9c..535f85b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1511,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
+			     DISCARD_FL_BARRIER);
 }
 #endif
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fba7957..fbc4324 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -857,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 					goto start_new_extent;
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
-							    nr_sects, GFP_NOFS);
+							    nr_sects, GFP_NOFS,
+							    DISCARD_FL_BARRIER);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -871,7 +872,8 @@ start_new_extent:
 		}
 	}
 	if (nr_sects) {
-		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
+		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
+					 DISCARD_FL_BARRIER);
 		if (rv)
 			goto fail;
 	}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 86c2bdf..e23a86c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -998,15 +998,18 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
-extern int blkdev_issue_discard(struct block_device *,
-				sector_t sector, sector_t nr_sects, gfp_t);
+#define DISCARD_FL_WAIT		0x01	/* wait for completion */
+#define DISCARD_FL_BARRIER	0x02	/* issue DISCARD_BARRIER request */
+extern int blkdev_issue_discard(struct block_device *, sector_t sector,
+		sector_t nr_sects, gfp_t, int flags);
 
 static inline int sb_issue_discard(struct super_block *sb,
 				   sector_t block, sector_t nr_blocks)
 {
 	block <<= (sb->s_blocksize_bits - 9);
 	nr_blocks <<= (sb->s_blocksize_bits - 9);
-	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL);
+	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL,
+				    DISCARD_FL_BARRIER);
 }
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8ffdc0d..74f1102 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -161,7 +161,8 @@ static int discard_swap(struct swap_info_struct *si)
 		}
 
 		err = blkdev_issue_discard(si->bdev, start_block,
-						nr_blocks, GFP_KERNEL);
+						nr_blocks, GFP_KERNEL,
+						DISCARD_FL_BARRIER);
 		if (err)
 			break;
 
@@ -200,7 +201,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 			start_block <<= PAGE_SHIFT - 9;
 			nr_blocks <<= PAGE_SHIFT - 9;
 			if (blkdev_issue_discard(si->bdev, start_block,
-							nr_blocks, GFP_NOIO))
+							nr_blocks, GFP_NOIO,
+							DISCARD_FL_BARRIER))
 				break;
 		}
 
-- 
cgit v0.10.2