summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
authorScott Wood <scottwood@freescale.com>2015-02-13 22:12:06 (GMT)
committerScott Wood <scottwood@freescale.com>2015-02-13 22:19:22 (GMT)
commit6faa2909871d8937cb2f79a10e1b21ffe193fac1 (patch)
treef558a94f1553814cc122ab8d9e04c0ebad5262a5 /drivers/md
parentfcb2fb84301c673ee15ca04e7a2fc965712d49a0 (diff)
downloadlinux-fsl-qoriq-6faa2909871d8937cb2f79a10e1b21ffe193fac1.tar.xz
Reset to 3.12.37
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/Kconfig1
-rw-r--r--drivers/md/bcache/bcache.h4
-rw-r--r--drivers/md/bcache/btree.c2
-rw-r--r--drivers/md/bcache/journal.c19
-rw-r--r--drivers/md/bcache/super.c15
-rw-r--r--drivers/md/bcache/util.h4
-rw-r--r--drivers/md/dm-bufio.c39
-rw-r--r--drivers/md/dm-cache-metadata.c173
-rw-r--r--drivers/md/dm-cache-target.c143
-rw-r--r--drivers/md/dm-crypt.c81
-rw-r--r--drivers/md/dm-io.c22
-rw-r--r--drivers/md/dm-log-userspace-transfer.c2
-rw-r--r--drivers/md/dm-raid.c11
-rw-r--r--drivers/md/dm-thin-metadata.c89
-rw-r--r--drivers/md/dm-thin.c21
-rw-r--r--drivers/md/dm.c4
-rw-r--r--drivers/md/md.c21
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c15
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h3
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h6
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c2
-rw-r--r--drivers/md/persistent-data/dm-btree.c24
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c4
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c5
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h17
-rw-r--r--drivers/md/raid1.c29
-rw-r--r--drivers/md/raid10.c14
-rw-r--r--drivers/md/raid5.c30
-rw-r--r--drivers/md/raid5.h1
29 files changed, 487 insertions, 314 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 5eb76dd..f950c9d 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -1,7 +1,6 @@
config BCACHE
tristate "Block device as cache"
- depends on !PREEMPT_RT_FULL
---help---
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 0f12382..7552207 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -663,9 +663,13 @@ struct gc_stat {
* CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
* we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
* flushing dirty data).
+ *
+ * CACHE_SET_RUNNING means all cache devices have been registered and journal
+ * replay is complete.
*/
#define CACHE_SET_UNREGISTERING 0
#define CACHE_SET_STOPPING 1
+#define CACHE_SET_RUNNING 2
struct cache_set {
struct closure cl;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index d1734d9..26ca4db 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -141,7 +141,7 @@ static void bch_btree_node_read_done(struct btree *b)
struct bset *i = b->sets[0].data;
struct btree_iter *iter;
- iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+ iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
iter->used = 0;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8435f81..c494379 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -42,11 +42,11 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
int ret = 0;
sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
- pr_debug("reading %llu", (uint64_t) bucket);
+ pr_debug("reading %u", bucket_index);
while (offset < ca->sb.bucket_size) {
reread: left = ca->sb.bucket_size - offset;
- len = min_t(unsigned, left, PAGE_SECTORS * 8);
+ len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);
bio_reset(bio);
bio->bi_sector = bucket + offset;
@@ -72,17 +72,26 @@ reread: left = ca->sb.bucket_size - offset;
struct list_head *where;
size_t blocks, bytes = set_bytes(j);
- if (j->magic != jset_magic(ca->set))
+ if (j->magic != jset_magic(ca->set)) {
+ pr_debug("%u: bad magic", bucket_index);
return ret;
+ }
- if (bytes > left << 9)
+ if (bytes > left << 9 ||
+ bytes > PAGE_SIZE << JSET_BITS) {
+ pr_info("%u: too big, %zu bytes, offset %u",
+ bucket_index, bytes, offset);
return ret;
+ }
if (bytes > len << 9)
goto reread;
- if (j->csum != csum_set(j))
+ if (j->csum != csum_set(j)) {
+ pr_info("%u: bad csum, %zu bytes, offset %u",
+ bucket_index, bytes, offset);
return ret;
+ }
blocks = set_blocks(j, ca->set);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 547c4c5..f5004c5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1235,6 +1235,9 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
if (test_bit(CACHE_SET_STOPPING, &c->flags))
return -EINTR;
+ if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+ return -EPERM;
+
u = uuid_find_empty(c);
if (!u) {
pr_err("Can't create volume, no room for UUID");
@@ -1300,8 +1303,11 @@ static void cache_set_free(struct closure *cl)
bch_journal_free(c);
for_each_cache(ca, c, i)
- if (ca)
+ if (ca) {
+ ca->set = NULL;
+ c->cache[ca->sb.nr_this_dev] = NULL;
kobject_put(&ca->kobj);
+ }
free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
@@ -1637,6 +1643,7 @@ static void run_cache_set(struct cache_set *c)
flash_devs_run(c);
+ set_bit(CACHE_SET_RUNNING, &c->flags);
return;
err_unlock_gc:
closure_set_stopped(&c->gc.cl);
@@ -1722,8 +1729,10 @@ void bch_cache_release(struct kobject *kobj)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
- if (ca->set)
+ if (ca->set) {
+ BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
ca->set->cache[ca->sb.nr_this_dev] = NULL;
+ }
bch_cache_allocator_exit(ca);
@@ -1794,7 +1803,7 @@ err:
}
static void register_cache(struct cache_sb *sb, struct page *sb_page,
- struct block_device *bdev, struct cache *ca)
+ struct block_device *bdev, struct cache *ca)
{
char name[BDEVNAME_SIZE];
const char *err = "cannot allocate memory";
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ea345c6..4dd1e2c 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -417,8 +417,8 @@ do { \
average_frequency, frequency_units); \
__print_time_stat(stats, name, \
average_duration, duration_units); \
- __print_time_stat(stats, name, \
- max_duration, duration_units); \
+ sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
+ div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\
\
sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
? div_s64(local_clock() - (stats)->last, \
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 54bdd923..93edd89 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -463,6 +463,7 @@ static void __relink_lru(struct dm_buffer *b, int dirty)
c->n_buffers[dirty]++;
b->list_mode = dirty;
list_move(&b->lru_list, &c->lru[dirty]);
+ b->last_accessed = jiffies;
}
/*----------------------------------------------------------------
@@ -529,6 +530,19 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
end_io(&b->bio, r);
}
+static void inline_endio(struct bio *bio, int error)
+{
+ bio_end_io_t *end_fn = bio->bi_private;
+
+ /*
+ * Reset the bio to free any attached resources
+ * (e.g. bio integrity profiles).
+ */
+ bio_reset(bio);
+
+ end_fn(bio, error);
+}
+
static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
bio_end_io_t *end_io)
{
@@ -540,7 +554,12 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
b->bio.bi_sector = block << b->c->sectors_per_block_bits;
b->bio.bi_bdev = b->c->bdev;
- b->bio.bi_end_io = end_io;
+ b->bio.bi_end_io = inline_endio;
+ /*
+ * Use of .bi_private isn't a problem here because
+ * the dm_buffer's inline bio is local to bufio.
+ */
+ b->bio.bi_private = end_io;
/*
* We assume that if len >= PAGE_SIZE ptr is page-aligned.
@@ -1417,9 +1436,9 @@ static void drop_buffers(struct dm_bufio_client *c)
/*
* Test if the buffer is unused and too old, and commit it.
- * At if noio is set, we must not do any I/O because we hold
- * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to
- * different bufio client.
+ * And if GFP_NOFS is used, we must not do any I/O because we hold
+ * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
+ * rerouted to different bufio client.
*/
static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
unsigned long max_jiffies)
@@ -1427,7 +1446,7 @@ static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
if (jiffies - b->last_accessed < max_jiffies)
return 0;
- if (!(gfp & __GFP_IO)) {
+ if (!(gfp & __GFP_FS)) {
if (test_bit(B_READING, &b->state) ||
test_bit(B_WRITING, &b->state) ||
test_bit(B_DIRTY, &b->state))
@@ -1455,9 +1474,9 @@ static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
freed += __cleanup_old_buffer(b, gfp_mask, 0);
if (!--nr_to_scan)
- break;
+ return freed;
+ dm_bufio_cond_resched();
}
- dm_bufio_cond_resched();
}
return freed;
}
@@ -1469,7 +1488,7 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
unsigned long freed;
c = container_of(shrink, struct dm_bufio_client, shrinker);
- if (sc->gfp_mask & __GFP_IO)
+ if (sc->gfp_mask & __GFP_FS)
dm_bufio_lock(c);
else if (!dm_bufio_trylock(c))
return SHRINK_STOP;
@@ -1486,7 +1505,7 @@ dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
unsigned long count;
c = container_of(shrink, struct dm_bufio_client, shrinker);
- if (sc->gfp_mask & __GFP_IO)
+ if (sc->gfp_mask & __GFP_FS)
dm_bufio_lock(c);
else if (!dm_bufio_trylock(c))
return 0;
@@ -1511,7 +1530,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
BUG_ON(block_size < 1 << SECTOR_SHIFT ||
(block_size & (block_size - 1)));
- c = kmalloc(sizeof(*c), GFP_KERNEL);
+ c = kzalloc(sizeof(*c), GFP_KERNEL);
if (!c) {
r = -ENOMEM;
goto bad_client;
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 1af7255..0bfd9c0 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -88,6 +88,9 @@ struct cache_disk_superblock {
} __packed;
struct dm_cache_metadata {
+ atomic_t ref_count;
+ struct list_head list;
+
struct block_device *bdev;
struct dm_block_manager *bm;
struct dm_space_map *metadata_sm;
@@ -114,6 +117,12 @@ struct dm_cache_metadata {
unsigned policy_version[CACHE_POLICY_VERSION_SIZE];
size_t policy_hint_size;
struct dm_cache_statistics stats;
+
+ /*
+ * Reading the space map root can fail, so we read it into this
+ * buffer before the superblock is locked and updated.
+ */
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};
/*-------------------------------------------------------------------
@@ -242,11 +251,31 @@ static void __setup_mapping_info(struct dm_cache_metadata *cmd)
}
}
+static int __save_sm_root(struct dm_cache_metadata *cmd)
+{
+ int r;
+ size_t metadata_len;
+
+ r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
+ if (r < 0)
+ return r;
+
+ return dm_sm_copy_root(cmd->metadata_sm, &cmd->metadata_space_map_root,
+ metadata_len);
+}
+
+static void __copy_sm_root(struct dm_cache_metadata *cmd,
+ struct cache_disk_superblock *disk_super)
+{
+ memcpy(&disk_super->metadata_space_map_root,
+ &cmd->metadata_space_map_root,
+ sizeof(cmd->metadata_space_map_root));
+}
+
static int __write_initial_superblock(struct dm_cache_metadata *cmd)
{
int r;
struct dm_block *sblock;
- size_t metadata_len;
struct cache_disk_superblock *disk_super;
sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT;
@@ -254,12 +283,16 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS)
bdev_size = DM_CACHE_METADATA_MAX_SECTORS;
- r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
+ r = dm_tm_pre_commit(cmd->tm);
if (r < 0)
return r;
- r = dm_tm_pre_commit(cmd->tm);
- if (r < 0)
+ /*
+ * dm_sm_copy_root() can fail. So we need to do it before we start
+ * updating the superblock.
+ */
+ r = __save_sm_root(cmd);
+ if (r)
return r;
r = superblock_lock_zero(cmd, &sblock);
@@ -275,10 +308,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
disk_super->policy_hint_size = 0;
- r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
- metadata_len);
- if (r < 0)
- goto bad_locked;
+ __copy_sm_root(cmd, disk_super);
disk_super->mapping_root = cpu_to_le64(cmd->root);
disk_super->hint_root = cpu_to_le64(cmd->hint_root);
@@ -295,10 +325,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->write_misses = cpu_to_le32(0);
return dm_tm_commit(cmd->tm, sblock);
-
-bad_locked:
- dm_bm_unlock(sblock);
- return r;
}
static int __format_metadata(struct dm_cache_metadata *cmd)
@@ -384,6 +410,15 @@ static int __open_metadata(struct dm_cache_metadata *cmd)
disk_super = dm_block_data(sblock);
+ /* Verify the data block size hasn't changed */
+ if (le32_to_cpu(disk_super->data_block_size) != cmd->data_block_size) {
+ DMERR("changing the data block size (from %u to %llu) is not supported",
+ le32_to_cpu(disk_super->data_block_size),
+ (unsigned long long)cmd->data_block_size);
+ r = -EINVAL;
+ goto bad;
+ }
+
r = __check_incompat_features(disk_super, cmd);
if (r < 0)
goto bad;
@@ -511,8 +546,9 @@ static int __begin_transaction_flags(struct dm_cache_metadata *cmd,
disk_super = dm_block_data(sblock);
update_flags(disk_super, mutator);
read_superblock_fields(cmd, disk_super);
+ dm_bm_unlock(sblock);
- return dm_bm_flush_and_unlock(cmd->bm, sblock);
+ return dm_bm_flush(cmd->bm);
}
static int __begin_transaction(struct dm_cache_metadata *cmd)
@@ -540,7 +576,6 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
flags_mutator mutator)
{
int r;
- size_t metadata_len;
struct cache_disk_superblock *disk_super;
struct dm_block *sblock;
@@ -558,8 +593,8 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
if (r < 0)
return r;
- r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
- if (r < 0)
+ r = __save_sm_root(cmd);
+ if (r)
return r;
r = superblock_lock(cmd, &sblock);
@@ -586,13 +621,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits);
disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses);
-
- r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
- metadata_len);
- if (r < 0) {
- dm_bm_unlock(sblock);
- return r;
- }
+ __copy_sm_root(cmd, disk_super);
return dm_tm_commit(cmd->tm, sblock);
}
@@ -624,10 +653,10 @@ static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
/*----------------------------------------------------------------*/
-struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
- sector_t data_block_size,
- bool may_format_device,
- size_t policy_hint_size)
+static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
+ sector_t data_block_size,
+ bool may_format_device,
+ size_t policy_hint_size)
{
int r;
struct dm_cache_metadata *cmd;
@@ -638,6 +667,7 @@ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
return NULL;
}
+ atomic_set(&cmd->ref_count, 1);
init_rwsem(&cmd->root_lock);
cmd->bdev = bdev;
cmd->data_block_size = data_block_size;
@@ -660,10 +690,95 @@ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
return cmd;
}
+/*
+ * We keep a little list of ref counted metadata objects to prevent two
+ * different target instances creating separate bufio instances. This is
+ * an issue if a table is reloaded before the suspend.
+ */
+static DEFINE_MUTEX(table_lock);
+static LIST_HEAD(table);
+
+static struct dm_cache_metadata *lookup(struct block_device *bdev)
+{
+ struct dm_cache_metadata *cmd;
+
+ list_for_each_entry(cmd, &table, list)
+ if (cmd->bdev == bdev) {
+ atomic_inc(&cmd->ref_count);
+ return cmd;
+ }
+
+ return NULL;
+}
+
+static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
+ sector_t data_block_size,
+ bool may_format_device,
+ size_t policy_hint_size)
+{
+ struct dm_cache_metadata *cmd, *cmd2;
+
+ mutex_lock(&table_lock);
+ cmd = lookup(bdev);
+ mutex_unlock(&table_lock);
+
+ if (cmd)
+ return cmd;
+
+ cmd = metadata_open(bdev, data_block_size, may_format_device, policy_hint_size);
+ if (cmd) {
+ mutex_lock(&table_lock);
+ cmd2 = lookup(bdev);
+ if (cmd2) {
+ mutex_unlock(&table_lock);
+ __destroy_persistent_data_objects(cmd);
+ kfree(cmd);
+ return cmd2;
+ }
+ list_add(&cmd->list, &table);
+ mutex_unlock(&table_lock);
+ }
+
+ return cmd;
+}
+
+static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size)
+{
+ if (cmd->data_block_size != data_block_size) {
+ DMERR("data_block_size (%llu) different from that in metadata (%llu)\n",
+ (unsigned long long) data_block_size,
+ (unsigned long long) cmd->data_block_size);
+ return false;
+ }
+
+ return true;
+}
+
+struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
+ sector_t data_block_size,
+ bool may_format_device,
+ size_t policy_hint_size)
+{
+ struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size,
+ may_format_device, policy_hint_size);
+ if (cmd && !same_params(cmd, data_block_size)) {
+ dm_cache_metadata_close(cmd);
+ return NULL;
+ }
+
+ return cmd;
+}
+
void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
{
- __destroy_persistent_data_objects(cmd);
- kfree(cmd);
+ if (atomic_dec_and_test(&cmd->ref_count)) {
+ mutex_lock(&table_lock);
+ list_del(&cmd->list);
+ mutex_unlock(&table_lock);
+
+ __destroy_persistent_data_objects(cmd);
+ kfree(cmd);
+ }
}
int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 6ab68e0..1771845 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -146,7 +146,13 @@ struct cache {
struct list_head need_commit_migrations;
sector_t migration_threshold;
wait_queue_head_t migration_wait;
- atomic_t nr_migrations;
+ atomic_t nr_allocated_migrations;
+
+ /*
+ * The number of in flight migrations that are performing
+ * background io. eg, promotion, writeback.
+ */
+ atomic_t nr_io_migrations;
wait_queue_head_t quiescing_wait;
atomic_t quiescing_ack;
@@ -154,7 +160,7 @@ struct cache {
/*
* cache_size entries, dirty if set
*/
- dm_cblock_t nr_dirty;
+ atomic_t nr_dirty;
unsigned long *dirty_bitset;
/*
@@ -162,7 +168,7 @@ struct cache {
*/
dm_dblock_t discard_nr_blocks;
unsigned long *discard_bitset;
- uint32_t discard_block_size; /* a power of 2 times sectors per block */
+ uint32_t discard_block_size;
/*
* Rather than reconstructing the table line for the status we just
@@ -182,7 +188,6 @@ struct cache {
struct dm_deferred_set *all_io_ds;
mempool_t *migration_pool;
- struct dm_cache_migration *next_migration;
struct dm_cache_policy *policy;
unsigned policy_nr_args;
@@ -265,10 +270,31 @@ static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cel
dm_bio_prison_free_cell(cache->prison, cell);
}
+static struct dm_cache_migration *alloc_migration(struct cache *cache)
+{
+ struct dm_cache_migration *mg;
+
+ mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
+ if (mg) {
+ mg->cache = cache;
+ atomic_inc(&mg->cache->nr_allocated_migrations);
+ }
+
+ return mg;
+}
+
+static void free_migration(struct dm_cache_migration *mg)
+{
+ if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
+ wake_up(&mg->cache->migration_wait);
+
+ mempool_free(mg, mg->cache->migration_pool);
+}
+
static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
{
if (!p->mg) {
- p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
+ p->mg = alloc_migration(cache);
if (!p->mg)
return -ENOMEM;
}
@@ -297,7 +323,7 @@ static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
free_prison_cell(cache, p->cell1);
if (p->mg)
- mempool_free(p->mg, cache->migration_pool);
+ free_migration(p->mg);
}
static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
@@ -408,7 +434,7 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b)
static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
- cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
+ atomic_inc(&cache->nr_dirty);
policy_set_dirty(cache->policy, oblock);
}
}
@@ -417,8 +443,7 @@ static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cbl
{
if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
policy_clear_dirty(cache->policy, oblock);
- cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
- if (!from_cblock(cache->nr_dirty))
+ if (atomic_dec_return(&cache->nr_dirty) == 0)
dm_table_event(cache->ti->table);
}
}
@@ -709,24 +734,14 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
* Migration covers moving data from the origin device to the cache, or
* vice versa.
*--------------------------------------------------------------*/
-static void free_migration(struct dm_cache_migration *mg)
+static void inc_io_migrations(struct cache *cache)
{
- mempool_free(mg, mg->cache->migration_pool);
+ atomic_inc(&cache->nr_io_migrations);
}
-static void inc_nr_migrations(struct cache *cache)
+static void dec_io_migrations(struct cache *cache)
{
- atomic_inc(&cache->nr_migrations);
-}
-
-static void dec_nr_migrations(struct cache *cache)
-{
- atomic_dec(&cache->nr_migrations);
-
- /*
- * Wake the worker in case we're suspending the target.
- */
- wake_up(&cache->migration_wait);
+ atomic_dec(&cache->nr_io_migrations);
}
static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
@@ -749,11 +764,10 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
wake_worker(cache);
}
-static void cleanup_migration(struct dm_cache_migration *mg)
+static void free_io_migration(struct dm_cache_migration *mg)
{
- struct cache *cache = mg->cache;
+ dec_io_migrations(mg->cache);
free_migration(mg);
- dec_nr_migrations(cache);
}
static void migration_failure(struct dm_cache_migration *mg)
@@ -778,7 +792,7 @@ static void migration_failure(struct dm_cache_migration *mg)
cell_defer(cache, mg->new_ocell, 1);
}
- cleanup_migration(mg);
+ free_io_migration(mg);
}
static void migration_success_pre_commit(struct dm_cache_migration *mg)
@@ -787,9 +801,9 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
struct cache *cache = mg->cache;
if (mg->writeback) {
- cell_defer(cache, mg->old_ocell, false);
clear_dirty(cache, mg->old_oblock, mg->cblock);
- cleanup_migration(mg);
+ cell_defer(cache, mg->old_ocell, false);
+ free_io_migration(mg);
return;
} else if (mg->demote) {
@@ -799,14 +813,14 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
mg->old_oblock);
if (mg->promote)
cell_defer(cache, mg->new_ocell, true);
- cleanup_migration(mg);
+ free_io_migration(mg);
return;
}
} else {
if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
policy_remove_mapping(cache->policy, mg->new_oblock);
- cleanup_migration(mg);
+ free_io_migration(mg);
return;
}
}
@@ -837,12 +851,12 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
spin_unlock_irqrestore(&cache->lock, flags);
} else
- cleanup_migration(mg);
+ free_io_migration(mg);
} else {
- cell_defer(cache, mg->new_ocell, true);
clear_dirty(cache, mg->new_oblock, mg->cblock);
- cleanup_migration(mg);
+ cell_defer(cache, mg->new_ocell, true);
+ free_io_migration(mg);
}
}
@@ -1003,7 +1017,7 @@ static void promote(struct cache *cache, struct prealloc *structs,
mg->new_ocell = cell;
mg->start_jiffies = jiffies;
- inc_nr_migrations(cache);
+ inc_io_migrations(cache);
quiesce_migration(mg);
}
@@ -1024,7 +1038,7 @@ static void writeback(struct cache *cache, struct prealloc *structs,
mg->new_ocell = NULL;
mg->start_jiffies = jiffies;
- inc_nr_migrations(cache);
+ inc_io_migrations(cache);
quiesce_migration(mg);
}
@@ -1048,7 +1062,7 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
mg->new_ocell = new_ocell;
mg->start_jiffies = jiffies;
- inc_nr_migrations(cache);
+ inc_io_migrations(cache);
quiesce_migration(mg);
}
@@ -1109,7 +1123,7 @@ static void process_discard_bio(struct cache *cache, struct bio *bio)
static bool spare_migration_bandwidth(struct cache *cache)
{
- sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
+ sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
cache->sectors_per_block;
return current_volume < cache->migration_threshold;
}
@@ -1400,7 +1414,7 @@ static void stop_quiescing(struct cache *cache)
static void wait_for_migrations(struct cache *cache)
{
- wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
+ wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
}
static void stop_worker(struct cache *cache)
@@ -1509,9 +1523,6 @@ static void destroy(struct cache *cache)
{
unsigned i;
- if (cache->next_migration)
- mempool_free(cache->next_migration, cache->migration_pool);
-
if (cache->migration_pool)
mempool_destroy(cache->migration_pool);
@@ -1908,35 +1919,6 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
return 0;
}
-/*
- * We want the discard block size to be a power of two, at least the size
- * of the cache block size, and have no more than 2^14 discard blocks
- * across the origin.
- */
-#define MAX_DISCARD_BLOCKS (1 << 14)
-
-static bool too_many_discard_blocks(sector_t discard_block_size,
- sector_t origin_size)
-{
- (void) sector_div(origin_size, discard_block_size);
-
- return origin_size > MAX_DISCARD_BLOCKS;
-}
-
-static sector_t calculate_discard_block_size(sector_t cache_block_size,
- sector_t origin_size)
-{
- sector_t discard_block_size;
-
- discard_block_size = roundup_pow_of_two(cache_block_size);
-
- if (origin_size)
- while (too_many_discard_blocks(discard_block_size, origin_size))
- discard_block_size *= 2;
-
- return discard_block_size;
-}
-
#define DEFAULT_MIGRATION_THRESHOLD 2048
static int cache_create(struct cache_args *ca, struct cache **result)
@@ -1961,6 +1943,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ti->num_discard_bios = 1;
ti->discards_supported = true;
ti->discard_zeroes_data_unsupported = true;
+ /* Discard bios must be split on a block boundary */
+ ti->split_discard_bios = true;
cache->features = ca->features;
ti->per_bio_data_size = get_per_bio_data_size(cache);
@@ -2026,14 +2010,15 @@ static int cache_create(struct cache_args *ca, struct cache **result)
INIT_LIST_HEAD(&cache->quiesced_migrations);
INIT_LIST_HEAD(&cache->completed_migrations);
INIT_LIST_HEAD(&cache->need_commit_migrations);
- atomic_set(&cache->nr_migrations, 0);
+ atomic_set(&cache->nr_allocated_migrations, 0);
+ atomic_set(&cache->nr_io_migrations, 0);
init_waitqueue_head(&cache->migration_wait);
init_waitqueue_head(&cache->quiescing_wait);
atomic_set(&cache->quiescing_ack, 0);
r = -ENOMEM;
- cache->nr_dirty = 0;
+ atomic_set(&cache->nr_dirty, 0);
cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
if (!cache->dirty_bitset) {
*error = "could not allocate dirty bitset";
@@ -2041,9 +2026,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
}
clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
- cache->discard_block_size =
- calculate_discard_block_size(cache->sectors_per_block,
- cache->origin_sectors);
+ cache->discard_block_size = cache->sectors_per_block;
cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
if (!cache->discard_bitset) {
@@ -2087,8 +2070,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
goto bad;
}
- cache->next_migration = NULL;
-
cache->need_tick_bio = true;
cache->sized = false;
cache->quiescing = false;
@@ -2531,7 +2512,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
residency = policy_residency(cache->policy);
- DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
+ DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %lu ",
(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
(unsigned long long)nr_blocks_metadata,
(unsigned) atomic_read(&cache->stats.read_hit),
@@ -2541,7 +2522,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
(unsigned) atomic_read(&cache->stats.demotion),
(unsigned) atomic_read(&cache->stats.promotion),
(unsigned long long) from_cblock(residency),
- cache->nr_dirty);
+ (unsigned long) atomic_read(&cache->nr_dirty));
if (cache->features.write_through)
DMEMIT("1 writethrough ");
@@ -2630,7 +2611,7 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
/*
* FIXME: these limits may be incompatible with the cache device
*/
- limits->max_discard_sectors = cache->discard_block_size * 1024;
+ limits->max_discard_sectors = cache->discard_block_size;
limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
}
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0fce0bc..0f64dc5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,7 +18,6 @@
#include <linux/crypto.h>
#include <linux/workqueue.h>
#include <linux/backing-dev.h>
-#include <linux/percpu.h>
#include <linux/atomic.h>
#include <linux/scatterlist.h>
#include <asm/page.h>
@@ -44,6 +43,7 @@ struct convert_context {
unsigned int idx_out;
sector_t cc_sector;
atomic_t cc_pending;
+ struct ablkcipher_request *req;
};
/*
@@ -105,15 +105,7 @@ struct iv_lmk_private {
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
/*
- * Duplicated per-CPU state for cipher.
- */
-struct crypt_cpu {
- struct ablkcipher_request *req;
-};
-
-/*
- * The fields in here must be read only after initialization,
- * changing state should be in crypt_cpu.
+ * The fields in here must be read only after initialization.
*/
struct crypt_config {
struct dm_dev *dev;
@@ -143,12 +135,6 @@ struct crypt_config {
sector_t iv_offset;
unsigned int iv_size;
- /*
- * Duplicated per cpu state. Access through
- * per_cpu_ptr() only.
- */
- struct crypt_cpu __percpu *cpu;
-
/* ESSIV: struct crypto_cipher *essiv_tfm */
void *iv_private;
struct crypto_ablkcipher **tfms;
@@ -184,11 +170,6 @@ static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
-static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
-{
- return this_cpu_ptr(cc->cpu);
-}
-
/*
* Use this to access cipher attributes that are the same for each CPU.
*/
@@ -738,16 +719,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
static void crypt_alloc_req(struct crypt_config *cc,
struct convert_context *ctx)
{
- struct crypt_cpu *this_cc = this_crypt_config(cc);
unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
- if (!this_cc->req)
- this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+ if (!ctx->req)
+ ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
- ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]);
- ablkcipher_request_set_callback(this_cc->req,
+ ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+ ablkcipher_request_set_callback(ctx->req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
+ kcryptd_async_done, dmreq_of_req(cc, ctx->req));
}
/*
@@ -756,7 +736,6 @@ static void crypt_alloc_req(struct crypt_config *cc,
static int crypt_convert(struct crypt_config *cc,
struct convert_context *ctx)
{
- struct crypt_cpu *this_cc = this_crypt_config(cc);
int r;
atomic_set(&ctx->cc_pending, 1);
@@ -768,7 +747,7 @@ static int crypt_convert(struct crypt_config *cc,
atomic_inc(&ctx->cc_pending);
- r = crypt_convert_block(cc, ctx, this_cc->req);
+ r = crypt_convert_block(cc, ctx, ctx->req);
switch (r) {
/* async */
@@ -777,7 +756,7 @@ static int crypt_convert(struct crypt_config *cc,
INIT_COMPLETION(ctx->restart);
/* fall through*/
case -EINPROGRESS:
- this_cc->req = NULL;
+ ctx->req = NULL;
ctx->cc_sector++;
continue;
@@ -876,6 +855,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
io->sector = sector;
io->error = 0;
io->base_io = NULL;
+ io->ctx.req = NULL;
atomic_set(&io->io_pending, 0);
return io;
@@ -901,6 +881,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
if (!atomic_dec_and_test(&io->io_pending))
return;
+ if (io->ctx.req)
+ mempool_free(io->ctx.req, cc->req_pool);
mempool_free(io, cc->io_pool);
if (likely(!base_io))
@@ -1326,8 +1308,6 @@ static int crypt_wipe_key(struct crypt_config *cc)
static void crypt_dtr(struct dm_target *ti)
{
struct crypt_config *cc = ti->private;
- struct crypt_cpu *cpu_cc;
- int cpu;
ti->private = NULL;
@@ -1339,13 +1319,6 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->crypt_queue)
destroy_workqueue(cc->crypt_queue);
- if (cc->cpu)
- for_each_possible_cpu(cpu) {
- cpu_cc = per_cpu_ptr(cc->cpu, cpu);
- if (cpu_cc->req)
- mempool_free(cpu_cc->req, cc->req_pool);
- }
-
crypt_free_tfms(cc);
if (cc->bs)
@@ -1364,9 +1337,6 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->dev)
dm_put_device(ti, cc->dev);
- if (cc->cpu)
- free_percpu(cc->cpu);
-
kzfree(cc->cipher);
kzfree(cc->cipher_string);
@@ -1421,13 +1391,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
if (tmp)
DMWARN("Ignoring unexpected additional cipher options");
- cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
- __alignof__(struct crypt_cpu));
- if (!cc->cpu) {
- ti->error = "Cannot allocate per cpu state";
- goto bad_mem;
- }
-
/*
* For compatibility with the original dm-crypt mapping format, if
* only the cipher name is supplied, use cbc-plain.
@@ -1543,6 +1506,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
unsigned int key_size, opt_params;
unsigned long long tmpll;
int ret;
+ size_t iv_size_padding;
struct dm_arg_set as;
const char *opt_string;
char dummy;
@@ -1579,12 +1543,23 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->dmreq_start = sizeof(struct ablkcipher_request);
cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
- cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
- cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
- ~(crypto_tfm_ctx_alignment() - 1);
+ cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
+
+ if (crypto_ablkcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) {
+ /* Allocate the padding exactly */
+ iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request))
+ & crypto_ablkcipher_alignmask(any_tfm(cc));
+ } else {
+ /*
+ * If the cipher requires greater alignment than kmalloc
+ * alignment, we don't know the exact position of the
+ * initialization vector. We must assume worst case.
+ */
+ iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc));
+ }
cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
- sizeof(struct dm_crypt_request) + cc->iv_size);
+ sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size);
if (!cc->req_pool) {
ti->error = "Cannot allocate crypt request mempool";
goto bad;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2a20986..e60c2ea 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -10,6 +10,7 @@
#include <linux/device-mapper.h>
#include <linux/bio.h>
+#include <linux/completion.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
@@ -32,7 +33,7 @@ struct dm_io_client {
struct io {
unsigned long error_bits;
atomic_t count;
- struct task_struct *sleeper;
+ struct completion *wait;
struct dm_io_client *client;
io_notify_fn callback;
void *context;
@@ -121,8 +122,8 @@ static void dec_count(struct io *io, unsigned int region, int error)
invalidate_kernel_vmap_range(io->vma_invalidate_address,
io->vma_invalidate_size);
- if (io->sleeper)
- wake_up_process(io->sleeper);
+ if (io->wait)
+ complete(io->wait);
else {
unsigned long r = io->error_bits;
@@ -385,6 +386,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
*/
volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
+ DECLARE_COMPLETION_ONSTACK(wait);
if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
WARN_ON(1);
@@ -393,7 +395,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
- io->sleeper = current;
+ io->wait = &wait;
io->client = client;
io->vma_invalidate_address = dp->vma_invalidate_address;
@@ -401,15 +403,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
dispatch_io(rw, num_regions, where, dp, io, 1);
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- if (!atomic_read(&io->count))
- break;
-
- io_schedule();
- }
- set_current_state(TASK_RUNNING);
+ wait_for_completion_io(&wait);
if (error_bits)
*error_bits = io->error_bits;
@@ -432,7 +426,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
io = mempool_alloc(client->pool, GFP_NOIO);
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
- io->sleeper = NULL;
+ io->wait = NULL;
io->client = client;
io->callback = fn;
io->context = context;
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 08d9a20..c69d0b7 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -272,7 +272,7 @@ int dm_ulog_tfr_init(void)
r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
if (r) {
- cn_del_callback(&ulog_cn_id);
+ kfree(prealloced_cn_msg);
return r;
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 4880b69..5971538 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -785,8 +785,7 @@ struct dm_raid_superblock {
__le32 layout;
__le32 stripe_sectors;
- __u8 pad[452]; /* Round struct to 512 bytes. */
- /* Always set to 0 when writing. */
+ /* Remainder of a logical block is zero-filled when writing (see super_sync()). */
} __packed;
static int read_disk_sb(struct md_rdev *rdev, int size)
@@ -823,7 +822,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
test_bit(Faulty, &(rs->dev[i].rdev.flags)))
failed_devices |= (1ULL << i);
- memset(sb, 0, sizeof(*sb));
+ memset(sb + 1, 0, rdev->sb_size - sizeof(*sb));
sb->magic = cpu_to_le32(DM_RAID_MAGIC);
sb->features = cpu_to_le32(0); /* No features yet */
@@ -858,7 +857,11 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
uint64_t events_sb, events_refsb;
rdev->sb_start = 0;
- rdev->sb_size = sizeof(*sb);
+ rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev);
+ if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) {
+ DMERR("superblock size of a logical block is no longer valid");
+ return -EINVAL;
+ }
ret = read_disk_sb(rdev, rdev->sb_size);
if (ret)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 3bb4506..b63095c 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -192,6 +192,13 @@ struct dm_pool_metadata {
* operation possible in this state is the closing of the device.
*/
bool fail_io:1;
+
+ /*
+ * Reading the space map roots can fail, so we read it into these
+ * buffers before the superblock is locked and updated.
+ */
+ __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};
struct dm_thin_device {
@@ -431,26 +438,53 @@ static void __setup_btree_details(struct dm_pool_metadata *pmd)
pmd->details_info.value_type.equal = NULL;
}
+static int save_sm_roots(struct dm_pool_metadata *pmd)
+{
+ int r;
+ size_t len;
+
+ r = dm_sm_root_size(pmd->metadata_sm, &len);
+ if (r < 0)
+ return r;
+
+ r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
+ if (r < 0)
+ return r;
+
+ r = dm_sm_root_size(pmd->data_sm, &len);
+ if (r < 0)
+ return r;
+
+ return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
+}
+
+static void copy_sm_roots(struct dm_pool_metadata *pmd,
+ struct thin_disk_superblock *disk)
+{
+ memcpy(&disk->metadata_space_map_root,
+ &pmd->metadata_space_map_root,
+ sizeof(pmd->metadata_space_map_root));
+
+ memcpy(&disk->data_space_map_root,
+ &pmd->data_space_map_root,
+ sizeof(pmd->data_space_map_root));
+}
+
static int __write_initial_superblock(struct dm_pool_metadata *pmd)
{
int r;
struct dm_block *sblock;
- size_t metadata_len, data_len;
struct thin_disk_superblock *disk_super;
sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
if (bdev_size > THIN_METADATA_MAX_SECTORS)
bdev_size = THIN_METADATA_MAX_SECTORS;
- r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
- if (r < 0)
- return r;
-
- r = dm_sm_root_size(pmd->data_sm, &data_len);
+ r = dm_sm_commit(pmd->data_sm);
if (r < 0)
return r;
- r = dm_sm_commit(pmd->data_sm);
+ r = save_sm_roots(pmd);
if (r < 0)
return r;
@@ -471,15 +505,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
disk_super->trans_id = 0;
disk_super->held_root = 0;
- r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
- metadata_len);
- if (r < 0)
- goto bad_locked;
-
- r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
- data_len);
- if (r < 0)
- goto bad_locked;
+ copy_sm_roots(pmd, disk_super);
disk_super->data_mapping_root = cpu_to_le64(pmd->root);
disk_super->device_details_root = cpu_to_le64(pmd->details_root);
@@ -488,10 +514,6 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
return dm_tm_commit(pmd->tm, sblock);
-
-bad_locked:
- dm_bm_unlock(sblock);
- return r;
}
static int __format_metadata(struct dm_pool_metadata *pmd)
@@ -591,6 +613,15 @@ static int __open_metadata(struct dm_pool_metadata *pmd)
disk_super = dm_block_data(sblock);
+ /* Verify the data block size hasn't changed */
+ if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
+ DMERR("changing the data block size (from %u to %llu) is not supported",
+ le32_to_cpu(disk_super->data_block_size),
+ (unsigned long long)pmd->data_block_size);
+ r = -EINVAL;
+ goto bad_unlock_sblock;
+ }
+
r = __check_incompat_features(disk_super, pmd);
if (r < 0)
goto bad_unlock_sblock;
@@ -769,6 +800,10 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
if (r < 0)
return r;
+ r = save_sm_roots(pmd);
+ if (r < 0)
+ return r;
+
r = superblock_lock(pmd, &sblock);
if (r)
return r;
@@ -780,21 +815,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
disk_super->trans_id = cpu_to_le64(pmd->trans_id);
disk_super->flags = cpu_to_le32(pmd->flags);
- r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
- metadata_len);
- if (r < 0)
- goto out_locked;
-
- r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
- data_len);
- if (r < 0)
- goto out_locked;
+ copy_sm_roots(pmd, disk_super);
return dm_tm_commit(pmd->tm, sblock);
-
-out_locked:
- dm_bm_unlock(sblock);
- return r;
}
struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index e9587101..0396d7f 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1332,9 +1332,9 @@ static void process_deferred_bios(struct pool *pool)
*/
if (ensure_next_mapping(pool)) {
spin_lock_irqsave(&pool->lock, flags);
+ bio_list_add(&pool->deferred_bios, bio);
bio_list_merge(&pool->deferred_bios, &bios);
spin_unlock_irqrestore(&pool->lock, flags);
-
break;
}
@@ -1504,6 +1504,14 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
+ /*
+ * We must hold the virtual cell before doing the lookup, otherwise
+ * there's a race with discard.
+ */
+ build_virtual_key(tc->td, block, &key);
+ if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
+ return DM_MAPIO_SUBMITTED;
+
r = dm_thin_find_block(td, block, 0, &result);
/*
@@ -1527,13 +1535,10 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
* shared flag will be set in their case.
*/
thin_defer_bio(tc, bio);
+ cell_defer_no_holder_no_free(tc, &cell1);
return DM_MAPIO_SUBMITTED;
}
- build_virtual_key(tc->td, block, &key);
- if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
- return DM_MAPIO_SUBMITTED;
-
build_data_key(tc->td, result.block, &key);
if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
cell_defer_no_holder_no_free(tc, &cell1);
@@ -1554,6 +1559,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
* of doing so. Just error it.
*/
bio_io_error(bio);
+ cell_defer_no_holder_no_free(tc, &cell1);
return DM_MAPIO_SUBMITTED;
}
/* fall through */
@@ -1564,6 +1570,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
* provide the hint to load the metadata into cache.
*/
thin_defer_bio(tc, bio);
+ cell_defer_no_holder_no_free(tc, &cell1);
return DM_MAPIO_SUBMITTED;
default:
@@ -1573,6 +1580,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
* pool is switched to fail-io mode.
*/
bio_io_error(bio);
+ cell_defer_no_holder_no_free(tc, &cell1);
return DM_MAPIO_SUBMITTED;
}
}
@@ -2695,7 +2703,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
*/
if (pt->adjusted_pf.discard_passdown) {
data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
- limits->discard_granularity = data_limits->discard_granularity;
+ limits->discard_granularity = max(data_limits->discard_granularity,
+ pool->sectors_per_block << SECTOR_SHIFT);
} else
limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c147f06..a562d5a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1813,14 +1813,14 @@ static void dm_request_fn(struct request_queue *q)
if (map_request(ti, clone, md))
goto requeued;
- BUG_ON_NONRT(!irqs_disabled());
+ BUG_ON(!irqs_disabled());
spin_lock(q->queue_lock);
}
goto out;
requeued:
- BUG_ON_NONRT(!irqs_disabled());
+ BUG_ON(!irqs_disabled());
spin_lock(q->queue_lock);
delay_and_out:
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 015bc45..bf030d4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7371,8 +7371,10 @@ void md_do_sync(struct md_thread *thread)
/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
return;
- if (mddev->ro) /* never try to sync a read-only array */
+ if (mddev->ro) {/* never try to sync a read-only array */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return;
+ }
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
@@ -7482,6 +7484,19 @@ void md_do_sync(struct md_thread *thread)
rdev->recovery_offset < j)
j = rdev->recovery_offset;
rcu_read_unlock();
+
+ /* If there is a bitmap, we need to make sure all
+ * writes that started before we added a spare
+ * complete before we start doing a recovery.
+ * Otherwise the write might complete and (via
+ * bitmap_endwrite) set a bit in the bitmap after the
+ * recovery has checked that bit and skipped that
+ * region.
+ */
+ if (mddev->bitmap) {
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
}
printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
@@ -7825,6 +7840,7 @@ void md_check_recovery(struct mddev *mddev)
/* There is no thread, but we need to call
* ->spare_active and clear saved_raid_disk
*/
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
@@ -8520,7 +8536,8 @@ static int md_notify_reboot(struct notifier_block *this,
if (mddev_trylock(mddev)) {
if (mddev->pers)
__md_stop_writes(mddev);
- mddev->safemode = 2;
+ if (mddev->persistent)
+ mddev->safemode = 2;
mddev_unlock(mddev);
}
need_delay = 1;
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 064a3c2..30597f3 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -595,25 +595,14 @@ int dm_bm_unlock(struct dm_block *b)
}
EXPORT_SYMBOL_GPL(dm_bm_unlock);
-int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
- struct dm_block *superblock)
+int dm_bm_flush(struct dm_block_manager *bm)
{
- int r;
-
if (bm->read_only)
return -EPERM;
- r = dm_bufio_write_dirty_buffers(bm->bufio);
- if (unlikely(r)) {
- dm_bm_unlock(superblock);
- return r;
- }
-
- dm_bm_unlock(superblock);
-
return dm_bufio_write_dirty_buffers(bm->bufio);
}
-EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock);
+EXPORT_SYMBOL_GPL(dm_bm_flush);
void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
{
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 13cd58e..1b95dfc 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -105,8 +105,7 @@ int dm_bm_unlock(struct dm_block *b);
*
* This method always blocks.
*/
-int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
- struct dm_block *superblock);
+int dm_bm_flush(struct dm_block_manager *bm);
/*
* Request data is prefetched into the cache.
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index 37d367b..bf2b80d 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -42,6 +42,12 @@ struct btree_node {
} __packed;
+/*
+ * Locks a block using the btree node validator.
+ */
+int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
+ struct dm_block **result);
+
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
struct dm_btree_value_type *vt);
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index cf9fd67..1b5e13e 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -92,7 +92,7 @@ struct dm_block_validator btree_node_validator = {
/*----------------------------------------------------------------*/
-static int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
+int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
struct dm_block **result)
{
return dm_tm_read_lock(info->tm, b, &btree_node_validator, result);
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 468e371..9701d29 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -828,22 +828,26 @@ EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
* FIXME: We shouldn't use a recursive algorithm when we have limited stack
* space. Also this only works for single level trees.
*/
-static int walk_node(struct ro_spine *s, dm_block_t block,
+static int walk_node(struct dm_btree_info *info, dm_block_t block,
int (*fn)(void *context, uint64_t *keys, void *leaf),
void *context)
{
int r;
unsigned i, nr;
+ struct dm_block *node;
struct btree_node *n;
uint64_t keys;
- r = ro_step(s, block);
- n = ro_node(s);
+ r = bn_read_lock(info, block, &node);
+ if (r)
+ return r;
+
+ n = dm_block_data(node);
nr = le32_to_cpu(n->header.nr_entries);
for (i = 0; i < nr; i++) {
if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
- r = walk_node(s, value64(n, i), fn, context);
+ r = walk_node(info, value64(n, i), fn, context);
if (r)
goto out;
} else {
@@ -855,7 +859,7 @@ static int walk_node(struct ro_spine *s, dm_block_t block,
}
out:
- ro_pop(s);
+ dm_tm_unlock(info->tm, node);
return r;
}
@@ -863,15 +867,7 @@ int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
int (*fn)(void *context, uint64_t *keys, void *leaf),
void *context)
{
- int r;
- struct ro_spine spine;
-
BUG_ON(info->levels > 1);
-
- init_ro_spine(&spine, info);
- r = walk_node(&spine, root, fn, context);
- exit_ro_spine(&spine);
-
- return r;
+ return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 579b582..d9a5aa5 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -564,7 +564,9 @@ static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count
{
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
- return smm->ll.nr_blocks;
+ *count = smm->ll.nr_blocks;
+
+ return 0;
}
static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 81da1a2..3bc30a0 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -154,7 +154,7 @@ int dm_tm_pre_commit(struct dm_transaction_manager *tm)
if (r < 0)
return r;
- return 0;
+ return dm_bm_flush(tm->bm);
}
EXPORT_SYMBOL_GPL(dm_tm_pre_commit);
@@ -164,8 +164,9 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
return -EWOULDBLOCK;
wipe_shadow_table(tm);
+ dm_bm_unlock(root);
- return dm_bm_flush_and_unlock(tm->bm, root);
+ return dm_bm_flush(tm->bm);
}
EXPORT_SYMBOL_GPL(dm_tm_commit);
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index b5b1390..2772ed2 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -38,18 +38,17 @@ struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transac
/*
* We use a 2-phase commit here.
*
- * i) In the first phase the block manager is told to start flushing, and
- * the changes to the space map are written to disk. You should interrogate
- * your particular space map to get detail of its root node etc. to be
- * included in your superblock.
+ * i) Make all changes for the transaction *except* for the superblock.
+ * Then call dm_tm_pre_commit() to flush them to disk.
*
- * ii) @root will be committed last. You shouldn't use more than the
- * first 512 bytes of @root if you wish the transaction to survive a power
- * failure. You *must* have a write lock held on @root for both stage (i)
- * and (ii). The commit will drop the write lock.
+ * ii) Lock your superblock. Update. Then call dm_tm_commit() which will
+ * unlock the superblock and flush it. No other blocks should be updated
+ * during this period. Care should be taken to never unlock a partially
+ * updated superblock; perform any operations that could fail *before* you
+ * take the superblock lock.
*/
int dm_tm_pre_commit(struct dm_transaction_manager *tm);
-int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root);
+int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock);
/*
* These methods are the only way to get hold of a writeable block.
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6edc2db..6564eeb 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -94,6 +94,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
struct pool_info *pi = data;
struct r1bio *r1_bio;
struct bio *bio;
+ int need_pages;
int i, j;
r1_bio = r1bio_pool_alloc(gfp_flags, pi);
@@ -116,15 +117,15 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
* RESYNC_PAGES for each bio.
*/
if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
- j = pi->raid_disks;
+ need_pages = pi->raid_disks;
else
- j = 1;
- while(j--) {
+ need_pages = 1;
+ for (j = 0; j < need_pages; j++) {
bio = r1_bio->bios[j];
bio->bi_vcnt = RESYNC_PAGES;
if (bio_alloc_pages(bio, gfp_flags))
- goto out_free_bio;
+ goto out_free_pages;
}
/* If not user-requests, copy the page pointers to all bios */
if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
@@ -138,6 +139,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
return r1_bio;
+out_free_pages:
+ while (--j >= 0) {
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, r1_bio->bios[j], i)
+ __free_page(bv->bv_page);
+ }
+
out_free_bio:
while (++j < pi->raid_disks)
bio_put(r1_bio->bios[j]);
@@ -1397,12 +1406,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
mddev->degraded++;
set_bit(Faulty, &rdev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * if recovery is running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
} else
set_bit(Faulty, &rdev->flags);
+ /*
+ * if recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
printk(KERN_ALERT
"md/raid1:%s: Disk failure on %s, disabling device.\n"
@@ -2043,7 +2052,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
- test_bit(In_sync, &rdev->flags))
+ !test_bit(Faulty, &rdev->flags))
r1_sync_page_io(rdev, sect, s,
conf->tmppage, WRITE);
}
@@ -2055,7 +2064,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
+ !test_bit(Faulty, &rdev->flags)) {
if (r1_sync_page_io(rdev, sect, s,
conf->tmppage, READ)) {
atomic_add(s, &rdev->corrected_errors);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 308575d..9ccb107 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1698,13 +1698,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
spin_unlock_irqrestore(&conf->device_lock, flags);
return;
}
- if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
- /*
- * if recovery is running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- }
+ /*
+ * If recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -2970,6 +2969,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
*/
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
end_reshape(conf);
+ close_sync(conf);
return 0;
}
@@ -4420,7 +4420,7 @@ read_more:
read_bio->bi_private = r10_bio;
read_bio->bi_end_io = end_sync_read;
read_bio->bi_rw = READ;
- read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+ read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
read_bio->bi_flags |= 1 << BIO_UPTODATE;
read_bio->bi_vcnt = 0;
read_bio->bi_size = 0;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ab00c1e..7b54c3b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -64,6 +64,10 @@
#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE
+static bool devices_handle_discard_safely = false;
+module_param(devices_handle_discard_safely, bool, 0644);
+MODULE_PARM_DESC(devices_handle_discard_safely,
+ "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;
/*
* Stripe cache
@@ -1541,9 +1545,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
struct raid5_percpu *percpu;
unsigned long cpu;
- cpu = get_cpu_light();
+ cpu = get_cpu();
percpu = per_cpu_ptr(conf->percpu, cpu);
- spin_lock(&percpu->lock);
if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
ops_run_biofill(sh);
overlap_clear++;
@@ -1595,8 +1598,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
if (test_and_clear_bit(R5_Overlap, &dev->flags))
wake_up(&sh->raid_conf->wait_for_overlap);
}
- spin_unlock(&percpu->lock);
- put_cpu_light();
+ put_cpu();
}
static int grow_one_stripe(struct r5conf *conf)
@@ -2787,7 +2789,8 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
(s->failed >= 2 && fdev[1]->toread) ||
(sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
!test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
- (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
+ ((sh->raid_conf->level == 6 || sh->sector >= sh->raid_conf->mddev->recovery_cp)
+ && s->failed && s->to_write))) {
/* we would like to get this block, possibly by computing it,
* otherwise read it if the backing disk is insync
*/
@@ -3674,6 +3677,8 @@ static void handle_stripe(struct stripe_head *sh)
set_bit(R5_Wantwrite, &dev->flags);
if (prexor)
continue;
+ if (s.failed > 1)
+ continue;
if (!test_bit(R5_Insync, &dev->flags) ||
((i == sh->pd_idx || i == sh->qd_idx) &&
s.failed == 0))
@@ -5458,7 +5463,6 @@ static int raid5_alloc_percpu(struct r5conf *conf)
__func__, cpu);
break;
}
- spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
}
put_online_cpus();
@@ -5939,7 +5943,7 @@ static int run(struct mddev *mddev)
mddev->queue->limits.discard_granularity = stripe;
/*
* unaligned part of discard request will be ignored, so can't
- * guarantee discard_zerors_data
+ * guarantee discard_zeroes_data
*/
mddev->queue->limits.discard_zeroes_data = 0;
@@ -5964,6 +5968,18 @@ static int run(struct mddev *mddev)
!bdev_get_queue(rdev->bdev)->
limits.discard_zeroes_data)
discard_supported = false;
+ /* Unfortunately, discard_zeroes_data is not currently
+ * a guarantee - just a hint. So we only allow DISCARD
+ * if the sysadmin has confirmed that only safe devices
+ * are in use by setting a module parameter.
+ */
+ if (!devices_handle_discard_safely) {
+ if (discard_supported) {
+ pr_info("md/raid456: discard support disabled due to uncertainty.\n");
+ pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
+ }
+ discard_supported = false;
+ }
}
if (discard_supported &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 0a5e1e1..2113ffa 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -444,7 +444,6 @@ struct r5conf {
int recovery_disabled;
/* per cpu variables */
struct raid5_percpu {
- spinlock_t lock; /* Protection for -RT */
struct page *spare_page; /* Used when checking P/Q in raid6 */
void *scribble; /* space for constructing buffer
* lists and performing address