From 997782735c0f1e2e069337129fe0d5738d83d19b Mon Sep 17 00:00:00 2001
From: Junichi Nomura
Date: Fri, 3 Oct 2014 11:55:16 +0000
Subject: dm: remove nr_iovecs parameter from alloc_tio()

alloc_tio() uses bio_alloc_bioset() to allocate a clone-bio for a bio.
alloc_tio() takes the number of bvecs to allocate for the clone-bio.
However, with v3.14's immutable biovec changes DM now uses
__bio_clone_fast() and no longer needs to allocate bvecs.

In practice, the 'nr_iovecs' passed to alloc_tio() is always effectively
0.  __clone_and_map_simple_bio() looked like it was passing non-zero
nr_iovecs, but its value was always within the range of inline bvecs and
no allocation actually happened.  If allocation happened, the BUG_ON()
in __bio_clone_fast() would've triggered.

Remove the nr_iovecs parameter from alloc_tio() to prevent possible
future bio_alloc_bioset() mis-use of a new bioset interface that will no
longer allow bvecs to be allocated.

Also fix extra whitespace before the __bio_clone_fast() call in
__clone_and_map_simple_bio().

Signed-off-by: Jun'ichi Nomura
Signed-off-by: Mike Snitzer

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 32b958d..0b9de07 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1249,13 +1249,13 @@ static void clone_bio(struct dm_target_io *tio, struct bio *bio,
 }

 static struct dm_target_io *alloc_tio(struct clone_info *ci,
-				      struct dm_target *ti, int nr_iovecs,
+				      struct dm_target *ti,
 				      unsigned target_bio_nr)
 {
 	struct dm_target_io *tio;
 	struct bio *clone;

-	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs);
+	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
 	tio = container_of(clone, struct dm_target_io, clone);

 	tio->io = ci->io;
@@ -1269,17 +1269,12 @@ static void __clone_and_map_simple_bio(struct clone_info *ci,
 				       struct dm_target *ti,
 				       unsigned target_bio_nr, unsigned *len)
 {
-	struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr);
+	struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
 	struct bio *clone = &tio->clone;

 	tio->len_ptr = len;

-	/*
-	 * Discard requests require the bio's inline iovecs be initialized.
-	 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
-	 * and discard, so no need for concern about wasted bvec allocations.
-	 */
-	 __bio_clone_fast(clone, ci->bio);
+	__bio_clone_fast(clone, ci->bio);

 	if (len)
 		bio_setup_sector(clone, ci->sector, *len);
@@ -1322,7 +1317,7 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti
 		num_target_bios = ti->num_write_bios(ti, bio);

 	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
-		tio = alloc_tio(ci, ti, 0, target_bio_nr);
+		tio = alloc_tio(ci, ti, target_bio_nr);
 		tio->len_ptr = len;
 		clone_bio(tio, bio, sector, *len);
 		__map_bio(tio);
--
cgit v0.10.2


From 3d8aab2d2cca2dc878e396196d07889129440798 Mon Sep 17 00:00:00 2001
From: Junichi Nomura
Date: Fri, 3 Oct 2014 11:55:26 +0000
Subject: dm: use bioset_create_nobvec()

Since DM core uses bio_clone_fast() for both bio-based and request-based
DM devices there is no need for DM's bioset to have a bvec mempool.

With this patch, on an arch with 4KB pages for example, memory usage
will be reduced by 64KB for each bio-based DM device and 1MB for each
request-based DM device.

For example, when you create 10,000 bio-based DM devices and 1,000
request-based DM devices, memory usage of biovec under no load is:

  # grep biovec /proc/slabinfo
  biovec-256    418068 418068 4096 ...
  biovec-128         0      0 2048 ...
  biovec-64          0      0 1024 ...
  biovec-16          0      0  256 ...
With this patch series applied, the usage becomes:

  # grep biovec /proc/slabinfo
  biovec-256       116    116 4096 ...
  biovec-128         0      0 2048 ...
  biovec-64          0      0 1024 ...
  biovec-16          0      0  256 ...

So 4096 * (418068 - 116) = 1.6GB of memory is saved in this example.

Signed-off-by: Jun'ichi Nomura
Signed-off-by: Mike Snitzer

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0b9de07..56a2c74 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2895,7 +2895,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 	if (!pools->io_pool)
 		goto out;

-	pools->bs = bioset_create(pool_size, front_pad);
+	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
 		goto out;

--
cgit v0.10.2


From 1f271972478d84dd9e4d6dd82f414d70ed9e78ce Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski
Date: Wed, 13 Aug 2014 13:53:42 -0500
Subject: dm mpath: stop queueing IO when no valid paths exist

'queue_io' is set so that IO is queued while paths are being
initialized.  Clear queue_io in __choose_pgpath if there are no valid
paths, since there are obviously no paths that can be initialized.
Otherwise IOs to the device will back up.

Signed-off-by: Benjamin Marzinski
Signed-off-by: Mike Snitzer

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 833d7e7..7b6b0f0 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -317,8 +317,10 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 	struct priority_group *pg;
 	unsigned bypassed = 1;

-	if (!m->nr_valid_paths)
+	if (!m->nr_valid_paths) {
+		m->queue_io = 0;
 		goto failed;
+	}

 	/* Were we instructed to switch PG? */
 	if (m->next_pg) {
--
cgit v0.10.2


From 86f1152b117a404229fd6f08ec3faca779f37b92 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski
Date: Wed, 13 Aug 2014 13:53:43 -0500
Subject: dm: allow active and inactive tables to share dm_devs

Until this change, when loading a new DM table, DM core would re-open
all of the devices in the DM table.  Now, DM core will avoid redundant
device opens (and closes when destroying the old table) if the old
table already has a device open using the same mode.  This is achieved
by managing reference counts on the table_devices that DM core now
stores in the mapped_device structure (rather than in the dm_table
structure).  So a mapped_device's active and inactive dm_tables' dm_dev
lists now just point to the dm_devs stored in the mapped_device's
table_devices list.

This improvement in DM core's device reference counting has the
side-effect of fixing a long-standing limitation of the multipath
target: a DM multipath table couldn't include any paths that were
unusable (failed).  For example: if all paths have failed and you add a
new, working path to the table, you can't use it since the table load
would fail due to it still containing failed paths.  Now a re-load of a
multipath table can include failed devices and when those devices
become active again they can be used instantly.

The device list code in dm.c isn't a straight copy/paste from the code
in dm-table.c, but it's very close (aside from some variable renames).
One subtle difference is that find_table_device for the tables_devices
list will only match devices with the same name and mode.  This is
because we don't want to upgrade a device's mode in the active table
when an inactive table is loaded.

Access to the mapped_device structure's tables_devices list requires a
mutex (tables_devices_lock), so that tables cannot be created and
destroyed concurrently.
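
As an illustration only -- a self-contained userspace C sketch with
invented names, not the kernel code (the real implementation is the
dm_get_table_device()/dm_put_table_device() pair added to dm.c below) --
the sharing scheme described above boils down to a mutex-protected,
refcounted list keyed on (dev, mode):

  #include <pthread.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct table_device {
      struct table_device *next;
      int refcount;             /* held by both active and inactive tables */
      unsigned int dev;         /* stand-in for dev_t */
      unsigned int mode;        /* stand-in for fmode_t */
  };

  static struct table_device *devices;
  static pthread_mutex_t devices_lock = PTHREAD_MUTEX_INITIALIZER;

  static struct table_device *get_table_device(unsigned int dev, unsigned int mode)
  {
      struct table_device *td;

      pthread_mutex_lock(&devices_lock);
      for (td = devices; td; td = td->next)
          if (td->dev == dev && td->mode == mode)   /* same dev AND mode */
              break;
      if (!td) {
          td = calloc(1, sizeof(*td));              /* "open" only on a miss */
          if (!td) {
              pthread_mutex_unlock(&devices_lock);
              return NULL;
          }
          td->dev = dev;
          td->mode = mode;
          td->next = devices;
          devices = td;
      }
      td->refcount++;
      pthread_mutex_unlock(&devices_lock);
      return td;
  }

  static void put_table_device(struct table_device *td)
  {
      pthread_mutex_lock(&devices_lock);
      if (--td->refcount == 0) {                    /* last user: "close" and free */
          struct table_device **pp = &devices;

          while (*pp != td)
              pp = &(*pp)->next;
          *pp = td->next;
          free(td);
      }
      pthread_mutex_unlock(&devices_lock);
  }

  int main(void)
  {
      struct table_device *active = get_table_device(0x800010, 3);
      struct table_device *inactive = get_table_device(0x800010, 3);

      printf("shared entry: %s\n", active == inactive ? "yes" : "no"); /* yes */
      put_table_device(inactive);   /* old table goes away ...              */
      put_table_device(active);     /* ... device "closes" only on last put */
      return 0;
  }
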
Signed-off-by: Benjamin Marzinski
Signed-off-by: Mike Snitzer

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 5152142..0be9381 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1418,7 +1418,7 @@ static void retrieve_deps(struct dm_table *table,
 	deps->count = count;
 	count = 0;
 	list_for_each_entry (dd, dm_table_get_devices(table), list)
-		deps->dev[count++] = huge_encode_dev(dd->dm_dev.bdev->bd_dev);
+		deps->dev[count++] = huge_encode_dev(dd->dm_dev->bdev->bd_dev);

 	param->data_size = param->data_start + needed;
 }

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index f9c6cb8..b2bd1eb 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -210,15 +210,16 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 	return 0;
 }

-static void free_devices(struct list_head *devices)
+static void free_devices(struct list_head *devices, struct mapped_device *md)
 {
 	struct list_head *tmp, *next;

 	list_for_each_safe(tmp, next, devices) {
 		struct dm_dev_internal *dd =
 		    list_entry(tmp, struct dm_dev_internal, list);
-		DMWARN("dm_table_destroy: dm_put_device call missing for %s",
-		       dd->dm_dev.name);
+		DMWARN("%s: dm_table_destroy: dm_put_device call missing for %s",
+		       dm_device_name(md), dd->dm_dev->name);
+		dm_put_table_device(md, dd->dm_dev);
 		kfree(dd);
 	}
 }
@@ -247,7 +248,7 @@ void dm_table_destroy(struct dm_table *t)
 	vfree(t->highs);

 	/* free the device list */
-	free_devices(&t->devices);
+	free_devices(&t->devices, t->md);

 	dm_free_md_mempools(t->mempools);

@@ -262,53 +263,13 @@ static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
 	struct dm_dev_internal *dd;

 	list_for_each_entry (dd, l, list)
-		if (dd->dm_dev.bdev->bd_dev == dev)
+		if (dd->dm_dev->bdev->bd_dev == dev)
 			return dd;

 	return NULL;
 }

 /*
- * Open a device so we can use it as a map destination.
- */
-static int open_dev(struct dm_dev_internal *d, dev_t dev,
-		    struct mapped_device *md)
-{
-	static char *_claim_ptr = "I belong to device-mapper";
-	struct block_device *bdev;
-
-	int r;
-
-	BUG_ON(d->dm_dev.bdev);
-
-	bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr);
-	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
-
-	r = bd_link_disk_holder(bdev, dm_disk(md));
-	if (r) {
-		blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL);
-		return r;
-	}
-
-	d->dm_dev.bdev = bdev;
-	return 0;
-}
-
-/*
- * Close a device that we've been using.
- */
-static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
-{
-	if (!d->dm_dev.bdev)
-		return;
-
-	bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md));
-	blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL);
-	d->dm_dev.bdev = NULL;
-}
-
-/*
  * If possible, this checks an area of a destination device is invalid.
  */
 static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
@@ -386,19 +347,17 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
 			struct mapped_device *md)
 {
 	int r;
-	struct dm_dev_internal dd_new, dd_old;
+	struct dm_dev *old_dev, *new_dev;

-	dd_new = dd_old = *dd;
+	old_dev = dd->dm_dev;

-	dd_new.dm_dev.mode |= new_mode;
-	dd_new.dm_dev.bdev = NULL;
-
-	r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md);
+	r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev,
+				dd->dm_dev->mode | new_mode, &new_dev);
 	if (r)
 		return r;

-	dd->dm_dev.mode |= new_mode;
-	close_dev(&dd_old, md);
+	dd->dm_dev = new_dev;
+	dm_put_table_device(md, old_dev);

 	return 0;
 }
@@ -440,27 +399,22 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 		if (!dd)
 			return -ENOMEM;

-		dd->dm_dev.mode = mode;
-		dd->dm_dev.bdev = NULL;
-
-		if ((r = open_dev(dd, dev, t->md))) {
+		if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) {
 			kfree(dd);
 			return r;
 		}

-		format_dev_t(dd->dm_dev.name, dev);
-
 		atomic_set(&dd->count, 0);
 		list_add(&dd->list, &t->devices);

-	} else if (dd->dm_dev.mode != (mode | dd->dm_dev.mode)) {
+	} else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
 		r = upgrade_mode(dd, mode, t->md);
 		if (r)
 			return r;
 	}

 	atomic_inc(&dd->count);
-	*result = &dd->dm_dev;
+	*result = dd->dm_dev;
 	return 0;
 }
 EXPORT_SYMBOL(dm_get_device);
@@ -505,11 +459,23 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
  */
 void dm_put_device(struct dm_target *ti, struct dm_dev *d)
 {
-	struct dm_dev_internal *dd = container_of(d, struct dm_dev_internal,
-						  dm_dev);
+	int found = 0;
+	struct list_head *devices = &ti->table->devices;
+	struct dm_dev_internal *dd;

+	list_for_each_entry(dd, devices, list) {
+		if (dd->dm_dev == d) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found) {
+		DMWARN("%s: device %s not in table devices list",
+		       dm_device_name(ti->table->md), d->name);
+		return;
+	}
 	if (atomic_dec_and_test(&dd->count)) {
-		close_dev(dd, ti->table->md);
+		dm_put_table_device(ti->table->md, d);
 		list_del(&dd->list);
 		kfree(dd);
 	}
@@ -906,7 +872,7 @@ static int dm_table_set_type(struct dm_table *t)
 	/* Non-request-stackable devices can't be used for request-based dm */
 	devices = dm_table_get_devices(t);
 	list_for_each_entry(dd, devices, list) {
-		if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) {
+		if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev->bdev))) {
 			DMWARN("table load rejected: including"
 			       " non-request-stackable devices");
 			return -EINVAL;
@@ -1043,7 +1009,7 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
 	struct gendisk *prev_disk = NULL, *template_disk = NULL;

 	list_for_each_entry(dd, devices, list) {
-		template_disk = dd->dm_dev.bdev->bd_disk;
+		template_disk = dd->dm_dev->bdev->bd_disk;
 		if (!blk_get_integrity(template_disk))
 			goto no_integrity;
 		if (!match_all && !blk_integrity_is_initialized(template_disk))
@@ -1629,7 +1595,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 	int r = 0;

 	list_for_each_entry(dd, devices, list) {
-		struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
+		struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
 		char b[BDEVNAME_SIZE];

 		if (likely(q))
@@ -1637,7 +1603,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 		else
 			DMWARN_LIMIT("%s: any_congested: nonexistent device %s",
 				     dm_device_name(t->md),
-				     bdevname(dd->dm_dev.bdev, b));
+				     bdevname(dd->dm_dev->bdev, b));
 	}

 	list_for_each_entry(cb, &t->target_callbacks, list)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 56a2c74..58f3927 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -142,6 +142,9 @@ struct mapped_device {
 	 */
 	struct dm_table *map;

+	struct list_head table_devices;
+	struct mutex table_devices_lock;
+
 	unsigned long flags;

 	struct request_queue *queue;
@@ -212,6 +215,12 @@ struct dm_md_mempools {
 	struct bio_set *bs;
 };

+struct table_device {
+	struct list_head list;
+	atomic_t count;
+	struct dm_dev dm_dev;
+};
+
 #define RESERVED_BIO_BASED_IOS		16
 #define RESERVED_REQUEST_BASED_IOS	256
 #define RESERVED_MAX_IOS		1024
@@ -670,6 +679,120 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 }

 /*
+ * Open a table device so we can use it as a map destination.
+ */
+static int open_table_device(struct table_device *td, dev_t dev,
+			     struct mapped_device *md)
+{
+	static char *_claim_ptr = "I belong to device-mapper";
+	struct block_device *bdev;
+
+	int r;
+
+	BUG_ON(td->dm_dev.bdev);
+
+	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+
+	r = bd_link_disk_holder(bdev, dm_disk(md));
+	if (r) {
+		blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
+		return r;
+	}
+
+	td->dm_dev.bdev = bdev;
+	return 0;
+}
+
+/*
+ * Close a table device that we've been using.
+ */
+static void close_table_device(struct table_device *td, struct mapped_device *md)
+{
+	if (!td->dm_dev.bdev)
+		return;
+
+	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
+	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
+	td->dm_dev.bdev = NULL;
+}
+
+static struct table_device *find_table_device(struct list_head *l, dev_t dev,
+					      fmode_t mode) {
+	struct table_device *td;
+
+	list_for_each_entry(td, l, list)
+		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
+			return td;
+
+	return NULL;
+}
+
+int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
+			struct dm_dev **result) {
+	int r;
+	struct table_device *td;
+
+	mutex_lock(&md->table_devices_lock);
+	td = find_table_device(&md->table_devices, dev, mode);
+	if (!td) {
+		td = kmalloc(sizeof(*td), GFP_KERNEL);
+		if (!td) {
+			mutex_unlock(&md->table_devices_lock);
+			return -ENOMEM;
+		}
+
+		td->dm_dev.mode = mode;
+		td->dm_dev.bdev = NULL;
+
+		if ((r = open_table_device(td, dev, md))) {
+			mutex_unlock(&md->table_devices_lock);
+			kfree(td);
+			return r;
+		}
+
+		format_dev_t(td->dm_dev.name, dev);
+
+		atomic_set(&td->count, 0);
+		list_add(&td->list, &md->table_devices);
+	}
+	atomic_inc(&td->count);
+	mutex_unlock(&md->table_devices_lock);
+
+	*result = &td->dm_dev;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_get_table_device);
+
+void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
+{
+	struct table_device *td = container_of(d, struct table_device, dm_dev);
+
+	mutex_lock(&md->table_devices_lock);
+	if (atomic_dec_and_test(&td->count)) {
+		close_table_device(td, md);
+		list_del(&td->list);
+		kfree(td);
+	}
+	mutex_unlock(&md->table_devices_lock);
+}
+EXPORT_SYMBOL(dm_put_table_device);
+
+static void free_table_devices(struct list_head *devices)
+{
+	struct list_head *tmp, *next;
+
+	list_for_each_safe(tmp, next, devices) {
+		struct table_device *td = list_entry(tmp, struct table_device, list);
+
+		DMWARN("dm_destroy: %s still exists with %d references",
+		       td->dm_dev.name, atomic_read(&td->count));
+		kfree(td);
+	}
+}
+
+/*
  * Get the geometry associated with a dm device
  */
 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
@@ -1944,12 +2067,14 @@ static struct mapped_device *alloc_dev(int minor)
 	md->type = DM_TYPE_NONE;
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
+	mutex_init(&md->table_devices_lock);
 	spin_lock_init(&md->deferred_lock);
 	atomic_set(&md->holders, 1);
 	atomic_set(&md->open_count, 0);
 	atomic_set(&md->event_nr, 0);
 	atomic_set(&md->uevent_seq, 0);
 	INIT_LIST_HEAD(&md->uevent_list);
+	INIT_LIST_HEAD(&md->table_devices);
 	spin_lock_init(&md->uevent_lock);

 	md->queue = blk_alloc_queue(GFP_KERNEL);
@@ -2035,6 +2160,7 @@ static void free_dev(struct mapped_device *md)
 	blk_integrity_unregister(md->disk);
 	del_gendisk(md->disk);
 	cleanup_srcu_struct(&md->io_barrier);
+	free_table_devices(&md->table_devices);
 	free_minor(minor);

 	spin_lock(&_minor_lock);

diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index e81d215..988c7fb 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -44,7 +44,7 @@ struct dm_dev_internal {
 	struct list_head list;
 	atomic_t count;
-	struct dm_dev dm_dev;
+	struct dm_dev *dm_dev;
 };

 struct dm_table;
@@ -188,6 +188,9 @@ int dm_cancel_deferred_remove(struct mapped_device *md);
 int dm_request_based(struct mapped_device *md);
 sector_t dm_get_size(struct mapped_device *md);
 struct request_queue *dm_get_md_queue(struct mapped_device *md);
+int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
+			struct dm_dev **result);
+void dm_put_table_device(struct mapped_device *md, struct dm_dev *d);
 struct dm_stats *dm_get_stats(struct mapped_device *md);

 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,

diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index c8a4302..3315ab2 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)

 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	27
+#define DM_VERSION_MINOR	28
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2013-10-30)"
+#define DM_VERSION_EXTRA	"-ioctl (2014-09-17)"

 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
--
cgit v0.10.2


From 75b8e04bbf01bdd5c42a1d8ac54abf757196ce49 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen
Date: Wed, 24 Sep 2014 17:47:18 +0200
Subject: dm raid: add discard support for RAID levels 1 and 10

Discard support is not enabled for RAID levels 4, 5, and 6 at this time
due to concerns about unreliable discard_zeroes_data support on some
hardware.  Otherwise, discards could cause stripe data corruption
(classic example of bad apples spoiling the bunch).

Signed-off-by: Heinz Mauelshagen
Signed-off-by: Mike Snitzer

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 4880b69..030e2d6 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2014 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -1150,6 +1150,27 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 }

 /*
+ * Enable/disable discard support on RAID set depending on RAID level.
+ */
+static void configure_discard_support(struct dm_target *ti, struct raid_set *rs)
+{
+	/* Assume discards not supported until after checks below. */
+	ti->discards_supported = false;
+
+	/* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
+	if (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6)
+		return; /* discard_zeroes_data cannot be trusted as reliable */
+
+	ti->discards_supported = true;
+
+	/*
+	 * RAID1 and RAID10 personalities require bio splitting,
+	 */
+	ti->split_discard_bios = true;
+	ti->num_discard_bios = 1;
+}
+
+/*
  * Construct a RAID4/5/6 mapping:
  * Args:
  *	<#raid_params> \
@@ -1231,6 +1252,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->private = rs;
 	ti->num_flush_bios = 1;

+	/*
+	 * Disable/enable discard support on RAID set.
+	 */
+	configure_discard_support(ti, rs);
+
 	mutex_lock(&rs->md.reconfig_mutex);
 	ret = md_run(&rs->md);
 	rs->md.in_sync = 0; /* Assume already marked dirty */
@@ -1652,7 +1678,7 @@ static void raid_resume(struct dm_target *ti)

 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 5, 2},
+	.version = {1, 6, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
--
cgit v0.10.2


From 48cf06bc5f508d5f71bc0fd7530daebb12a48428 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen
Date: Wed, 24 Sep 2014 17:47:19 +0200
Subject: dm raid: add discard support for RAID levels 4, 5 and 6

In case of RAID levels 4, 5 and 6 we have to verify each RAID member's
ability to zero data on discards to avoid stripe data corruption -- if
discard_zeroes_data is not set for each RAID member discard support
must be disabled.

But given the uncertainty of whether or not a RAID member properly
supports zeroing data on discard we require the user to explicitly
allow discard support on RAID levels 4, 5, and 6 by setting a dm-raid
module parameter, e.g.:

  dm-raid.devices_handle_discard_safely=Y

Otherwise, discards could cause data corruption on RAID4/5/6.

Signed-off-by: Heinz Mauelshagen
Signed-off-by: Mike Snitzer

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 030e2d6..4857fa4 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -18,6 +18,8 @@

 #define DM_MSG_PREFIX "raid"

+static bool devices_handle_discard_safely = false;
+
 /*
  * The following flags are used by dm-raid.c to set up the array state.
  * They must be cleared before md_run is called.
@@ -475,6 +477,8 @@ too_many:
  *					will form the "stripe"
  * [[no]sync]				Force or prevent recovery of the
  *					entire array
+ * [devices_handle_discard_safely]	Allow discards on RAID4/5/6; useful if RAID
+ *					member device(s) properly support TRIM/UNMAP
  * [rebuild ]				Rebuild the drive indicated by the index
  * [daemon_sleep ]			Time between bitmap daemon work to
  *					clear bits
@@ -1150,23 +1154,45 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 }

 /*
- * Enable/disable discard support on RAID set depending on RAID level.
+ * Enable/disable discard support on RAID set depending on
+ * RAID level and discard properties of underlying RAID members.
  */
 static void configure_discard_support(struct dm_target *ti, struct raid_set *rs)
 {
+	int i;
+	bool raid456;
+
 	/* Assume discards not supported until after checks below. */
 	ti->discards_supported = false;

 	/* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
-	if (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6)
-		return; /* discard_zeroes_data cannot be trusted as reliable */
+	raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);

+	for (i = 0; i < rs->md.raid_disks; i++) {
+		struct request_queue *q = bdev_get_queue(rs->dev[i].rdev.bdev);
+
+		if (!q || !blk_queue_discard(q))
+			return;
+
+		if (raid456) {
+			if (!q->limits.discard_zeroes_data)
+				return;
+			if (!devices_handle_discard_safely) {
+				DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
+				DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
+				return;
+			}
+		}
+	}
+
+	/* All RAID members properly support discards */
 	ti->discards_supported = true;

 	/*
 	 * RAID1 and RAID10 personalities require bio splitting,
+	 * RAID0/4/5/6 don't and process large discard bios properly.
 	 */
-	ti->split_discard_bios = true;
+	ti->split_discard_bios = !!(rs->md.level == 1 || rs->md.level == 10);
 	ti->num_discard_bios = 1;
 }

@@ -1709,6 +1735,10 @@ static void __exit dm_raid_exit(void)
 module_init(dm_raid_init);
 module_exit(dm_raid_exit);

+module_param(devices_handle_discard_safely, bool, 0644);
+MODULE_PARM_DESC(devices_handle_discard_safely,
+		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
+
 MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
 MODULE_ALIAS("dm-raid1");
 MODULE_ALIAS("dm-raid10");
--
cgit v0.10.2


From eb76faf53b1ff7a77ce3f78cc98ad392ac70c2a0 Mon Sep 17 00:00:00 2001
From: Joe Thornber
Date: Tue, 30 Sep 2014 09:32:46 +0100
Subject: dm bufio: update last_accessed when relinking a buffer

The 'last_accessed' member of the dm_buffer structure was only set when
the buffer was created.  This led to each buffer being discarded after
dm_bufio_max_age time even if it was used recently.  In practice this
resulted in all thinp metadata being evicted soon after being read --
this is particularly problematic for metadata intensive workloads like
multithreaded small random IO.

'last_accessed' is now updated each time the buffer is moved to the
head of the LRU list, so the buffer is now properly discarded if it was
not used in dm_bufio_max_age time.

Signed-off-by: Joe Thornber
Signed-off-by: Mikulas Patocka
Signed-off-by: Mike Snitzer
Cc: stable@vger.kernel.org # v3.2+

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index ab472c5..eab64bc 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -465,6 +465,7 @@ static void __relink_lru(struct dm_buffer *b, int dirty)
 	c->n_buffers[dirty]++;
 	b->list_mode = dirty;
 	list_move(&b->lru_list, &c->lru[dirty]);
+	b->last_accessed = jiffies;
 }

 /*----------------------------------------------------------------
--
cgit v0.10.2


From 0e825862f3c04cee40e25f55680333728a4ffa9b Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Wed, 1 Oct 2014 13:29:48 -0400
Subject: dm bufio: when done scanning return from __scan immediately

When __scan frees the required number of buffer entries that the
shrinker requested (nr_to_scan becomes zero) it must return.  Before
this fix the __scan code exited only the inner loop and continued in
the outer loop -- which could result in reduced performance due to
extra buffers being freed (e.g. unnecessarily evicted thinp metadata
needing to be synchronously re-read into bufio's cache).

Also, move dm_bufio_cond_resched to __scan's inner loop, so that
iterating the bufio client's lru lists doesn't result in scheduling
latency.
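
To make the control-flow issue concrete, here is a minimal, self-contained
C sketch (not the dm-bufio code; names and counts are invented) showing why
a bare "break" is not enough -- it only leaves the inner list walk, so the
outer loop keeps freeing buffers past the requested count, whereas returning
stops at exactly nr_to_scan:

  #include <stdio.h>

  static long scan_lists(long nr_to_scan)
  {
      long freed = 0;

      for (int list_mode = 0; list_mode < 2; list_mode++) {    /* CLEAN, DIRTY */
          for (int buf = 0; buf < 4; buf++) {                  /* buffers on this LRU */
              freed++;
              if (!--nr_to_scan)
                  return freed;   /* with "break" here, the outer loop would
                                     continue and free extra buffers */
          }
      }
      return freed;
  }

  int main(void)
  {
      printf("freed %ld\n", scan_lists(3));   /* prints "freed 3" */
      return 0;
  }
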
Reported-by: Joe Thornber
Signed-off-by: Mikulas Patocka
Signed-off-by: Mike Snitzer
Cc: stable@vger.kernel.org # 3.2+

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index eab64bc..9ea5b60 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1473,9 +1473,9 @@ static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
 		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
 			freed += __cleanup_old_buffer(b, gfp_mask, 0);
 			if (!--nr_to_scan)
-				break;
+				return freed;
+			dm_bufio_cond_resched();
 		}
-		dm_bufio_cond_resched();
 	}
 	return freed;
 }
--
cgit v0.10.2


From 56ec16cb1e1ce46354de8511eef962a417c32c92 Mon Sep 17 00:00:00 2001
From: Alexey Khoroshilov
Date: Wed, 1 Oct 2014 22:58:35 +0200
Subject: dm log userspace: fix memory leak in dm_ulog_tfr_init failure path

If cn_add_callback() fails in dm_ulog_tfr_init(), it does not
deallocate prealloced memory but calls cn_del_callback().

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Alexey Khoroshilov
Reviewed-by: Jonathan Brassow
Signed-off-by: Mike Snitzer
Cc: stable@vger.kernel.org

diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index b428c0a..39ad966 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -272,7 +272,7 @@ int dm_ulog_tfr_init(void)

 	r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
 	if (r) {
-		cn_del_callback(&ulog_cn_id);
+		kfree(prealloced_cn_msg);
 		return r;
 	}

--
cgit v0.10.2
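
For completeness, the error-handling rule this last fix restores can be shown
with a small self-contained C sketch (invented names; not the connector API):
on failure of the registration step, a function should release only what it
allocated itself rather than "undo" a registration that never happened:

  #include <stdio.h>
  #include <stdlib.h>

  static void *prealloced_msg;

  static int register_callback(void)
  {
      return -1;                      /* simulate cn_add_callback() failing */
  }

  static int tfr_init(void)
  {
      prealloced_msg = malloc(1024);
      if (!prealloced_msg)
          return -1;

      if (register_callback() < 0) {
          free(prealloced_msg);       /* the fix: free what we allocated ...   */
          prealloced_msg = NULL;      /* ... instead of unregistering nothing  */
          return -1;
      }
      return 0;
  }

  int main(void)
  {
      printf("tfr_init: %d\n", tfr_init());   /* prints "tfr_init: -1" */
      return 0;
  }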