From 145b9006a0ae753582902170ec1da49f89501845 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Thu, 12 Feb 2015 16:25:05 -0500
Subject: dm space map disk: fix sm_disk_count_is_more_than_one()

dm_tm_shadow_block() is the only caller of dm_sm_count_is_more_than_one(),
and it only ever operates on a metadata space-map.  So in practice,
sm_disk_count_is_more_than_one() isn't actually used (which explains why
this bug never amounted to anything).

But fix sm_disk_count_is_more_than_one() to properly set *result and
return 0.

Reported-by: Vivek Goyal
Signed-off-by: Mike Snitzer

diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index cfbf961..ebb280a 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -78,7 +78,9 @@ static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b,
         if (r)
                 return r;
 
-        return count > 1;
+        *result = count > 1;
+
+        return 0;
 }
 
 static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
-- cgit v0.10.2

From f2ed51ac64611d717d1917820a01930174c2f236 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Thu, 12 Feb 2015 10:09:20 -0500
Subject: dm mirror: do not degrade the mirror on discard error

It is possible that a device claims discard support but rejects
discards with -EOPNOTSUPP.  This happens when using a loopback device
on an ext2/ext3 filesystem driven by the ext4 driver.  It may also
happen if the underlying devices are moved from one disk to another.

If a discard error happens, we reject the bio with -EOPNOTSUPP, but we
do not degrade the array.

This patch fixes the failed test shell/lvconvert-repair-transient.sh in
the lvm2 testsuite if the testsuite is extracted on an ext2 or ext3
filesystem and it is being driven by the ext4 driver.

Signed-off-by: Mikulas Patocka
Signed-off-by: Mike Snitzer
Cc: stable@vger.kernel.org

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 7dfdb5c..089d627 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -604,6 +604,15 @@ static void write_callback(unsigned long error, void *context)
                 return;
         }
 
+        /*
+         * If the bio is discard, return an error, but do not
+         * degrade the array.
+         */
+        if (bio->bi_rw & REQ_DISCARD) {
+                bio_endio(bio, -EOPNOTSUPP);
+                return;
+        }
+
         for (i = 0; i < ms->nr_mirrors; i++)
                 if (test_bit(i, &error))
                         fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
-- cgit v0.10.2

From 37527b869207ad4c208b1e13967d69b8bba1fbf9 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Fri, 13 Feb 2015 11:05:37 -0800
Subject: dm io: reject unsupported DISCARD requests with EOPNOTSUPP

I created a dm-raid1 device backed by a device that supports DISCARD
and another device that does NOT support DISCARD with the following
dm configuration:

# echo '0 2048 mirror core 1 512 2 /dev/sda 0 /dev/sdb 0' | dmsetup create moo
# lsblk -D
NAME         DISC-ALN DISC-GRAN DISC-MAX DISC-ZERO
sda                 0        4K       1G         0
`-moo (dm-0)        0        4K       1G         0
sdb                 0        0B       0B         0
`-moo (dm-0)        0        4K       1G         0

Notice that the mirror device /dev/mapper/moo advertises DISCARD
support even though one of the mirror halves doesn't.

If I issue a DISCARD request (via fstrim, mount -o discard, or ioctl
BLKDISCARD) through the mirror, kmirrord gets stuck in an infinite loop
in do_region() when it tries to issue a DISCARD request to sdb.  The
problem is that when we call do_region() against sdb, num_sectors is
set to zero because q->limits.max_discard_sectors is zero.  Therefore,
"remaining" never decreases and the loop never terminates.
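The failure mode is easiest to see in isolation: do_region() splits the request into chunks of at most max_discard_sectors, so a limit of zero gives a zero-sized step and the loop can never make progress. Below is a minimal userspace model of that loop and of the pre-check the fix adds; only the names num_sectors, remaining and max_discard_sectors echo the real code (the actual patch tests blk_queue_discard(q) rather than the limit itself), everything else is invented for the sketch.

#include <stdio.h>

/*
 * Simplified model of the do_region() splitting loop described above.
 * A max_discard_sectors of 0 makes the per-iteration step 0, so
 * "remaining" never shrinks.  The pre-check stands in for the fix
 * (reject unsupported discards before ever entering the loop).
 */
static void dispatch_discard(unsigned long long remaining,
                             unsigned long long max_discard_sectors)
{
        if (!max_discard_sectors) {             /* queue cannot discard */
                printf("reject with -EOPNOTSUPP instead of spinning\n");
                return;
        }

        while (remaining) {
                unsigned long long num_sectors =
                        remaining < max_discard_sectors ?
                        remaining : max_discard_sectors;

                /* without the check above, num_sectors would be 0 here
                 * and this loop would never terminate */
                remaining -= num_sectors;
        }
        printf("discard fully dispatched\n");
}

int main(void)
{
        dispatch_discard(2048, 2097152);        /* sda-like device: terminates */
        dispatch_discard(2048, 0);              /* sdb-like device: rejected   */
        return 0;
}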
To fix this: before entering the loop, check for the combination of REQ_DISCARD and no discard and return -EOPNOTSUPP to avoid hanging up the mirror device. This bug was found by the unfortunate coincidence of pvmove and a discard operation in the RHEL 6.5 kernel; upstream is also affected. Signed-off-by: Darrick J. Wong Acked-by: "Martin K. Petersen" Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index c09359d..37de017 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -290,6 +290,12 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, unsigned short logical_block_size = queue_logical_block_size(q); sector_t num_sectors; + /* Reject unsupported discard requests */ + if ((rw & REQ_DISCARD) && !blk_queue_discard(q)) { + dec_count(io, region, -EOPNOTSUPP); + return; + } + /* * where->count may be zero if rw holds a flush and we need to * send a zero-sized flush. -- cgit v0.10.2 From f3396c58fd8442850e759843457d78b6ec3a9589 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 13 Feb 2015 08:23:09 -0500 Subject: dm crypt: use unbound workqueue for request processing Use unbound workqueue by default so that work is automatically balanced between available CPUs. The original behavior of encrypting using the same cpu that IO was submitted on can still be enabled by setting the optional 'same_cpu_crypt' table argument. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index c81839b..571f24f 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -51,7 +51,7 @@ Parameters: \ Otherwise #opt_params is the number of following arguments. Example of optional parameters section: - 1 allow_discards + 2 allow_discards same_cpu_crypt allow_discards Block discard requests (a.k.a. TRIM) are passed through the crypt device. @@ -63,6 +63,11 @@ allow_discards used space etc.) if the discarded blocks can be located easily on the device later. +same_cpu_crypt + Perform encryption using the same cpu that IO was submitted on. + The default is to use an unbound workqueue so that encryption work + is automatically balanced between available CPUs. + Example scripts =============== LUKS (Linux Unified Key Setup) is now the preferred way to set up disk diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 08981be..5063c90 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -108,7 +108,7 @@ struct iv_tcw_private { * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. */ -enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; +enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, DM_CRYPT_SAME_CPU }; /* * The fields in here must be read only after initialization. 
@@ -1688,7 +1688,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) char dummy; static struct dm_arg _args[] = { - {0, 1, "Invalid number of feature args"}, + {0, 2, "Invalid number of feature args"}, }; if (argc < 5) { @@ -1788,15 +1788,23 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret) goto bad; - opt_string = dm_shift_arg(&as); + while (opt_params--) { + opt_string = dm_shift_arg(&as); + if (!opt_string) { + ti->error = "Not enough feature arguments"; + goto bad; + } - if (opt_params == 1 && opt_string && - !strcasecmp(opt_string, "allow_discards")) - ti->num_discard_bios = 1; - else if (opt_params) { - ret = -EINVAL; - ti->error = "Invalid feature arguments"; - goto bad; + if (!strcasecmp(opt_string, "allow_discards")) + ti->num_discard_bios = 1; + + else if (!strcasecmp(opt_string, "same_cpu_crypt")) + set_bit(DM_CRYPT_SAME_CPU, &cc->flags); + + else { + ti->error = "Invalid feature arguments"; + goto bad; + } } } @@ -1807,8 +1815,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - cc->crypt_queue = alloc_workqueue("kcryptd", - WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); + if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) + cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); + else + cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, + num_online_cpus()); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; goto bad; @@ -1860,6 +1871,7 @@ static void crypt_status(struct dm_target *ti, status_type_t type, { struct crypt_config *cc = ti->private; unsigned i, sz = 0; + int num_feature_args = 0; switch (type) { case STATUSTYPE_INFO: @@ -1878,8 +1890,15 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, cc->dev->name, (unsigned long long)cc->start); - if (ti->num_discard_bios) - DMEMIT(" 1 allow_discards"); + num_feature_args += !!ti->num_discard_bios; + num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); + if (num_feature_args) { + DMEMIT(" %d", num_feature_args); + if (ti->num_discard_bios) + DMEMIT(" allow_discards"); + if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) + DMEMIT(" same_cpu_crypt"); + } break; } @@ -1976,7 +1995,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, -- cgit v0.10.2 From cf2f1abfbd0dba701f7f16ef619e4d2485de3366 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 13 Feb 2015 08:23:52 -0500 Subject: dm crypt: don't allocate pages for a partial request Change crypt_alloc_buffer so that it only ever allocates pages for a full request. This is a prerequisite for the commit "dm crypt: offload writes to thread". This change simplifies the dm-crypt code at the expense of reduced throughput in low memory conditions (where allocation for a partial request is most useful). Note: the next commit ("dm crypt: avoid deadlock in mempools") is needed to fix a theoretical deadlock. 
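The sizing behind the "full request only" allocation is plain page-rounding arithmetic: the clone bio gets ceil(size / PAGE_SIZE) vectors and only the last one may be short. A tiny standalone illustration of that math follows; PAGE_SIZE is hard-coded to 4096 and the request size is an arbitrary example, both assumptions of the sketch rather than values from the patch.

#include <stdio.h>

#define PAGE_SIZE 4096u         /* assumed here; the kernel macro is per-arch */

int main(void)
{
        unsigned size = 9216;   /* example request: 18 sectors */
        unsigned nr_iovecs = (size + PAGE_SIZE - 1) / PAGE_SIZE;
        unsigned remaining = size;

        for (unsigned i = 0; i < nr_iovecs; i++) {
                unsigned len = remaining > PAGE_SIZE ? PAGE_SIZE : remaining;

                printf("bvec %u: %u bytes\n", i, len);  /* last vector may be short */
                remaining -= len;
        }
        return 0;
}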
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 5063c90..6199245 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -58,7 +58,6 @@ struct dm_crypt_io { atomic_t io_pending; int error; sector_t sector; - struct dm_crypt_io *base_io; } CRYPTO_MINALIGN_ATTR; struct dm_crypt_request { @@ -172,7 +171,6 @@ struct crypt_config { }; #define MIN_IOS 16 -#define MIN_POOL_PAGES 32 static struct kmem_cache *_crypt_io_pool; @@ -946,14 +944,13 @@ static int crypt_convert(struct crypt_config *cc, return 0; } +static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone); + /* * Generate a new unfragmented bio with the given size * This should never violate the device limitations - * May return a smaller bio when running out of pages, indicated by - * *out_of_pages set to 1. */ -static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, - unsigned *out_of_pages) +static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) { struct crypt_config *cc = io->cc; struct bio *clone; @@ -961,41 +958,27 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; unsigned i, len; struct page *page; + struct bio_vec *bvec; clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); if (!clone) return NULL; clone_init(io, clone); - *out_of_pages = 0; for (i = 0; i < nr_iovecs; i++) { page = mempool_alloc(cc->page_pool, gfp_mask); - if (!page) { - *out_of_pages = 1; - break; - } - - /* - * If additional pages cannot be allocated without waiting, - * return a partially-allocated bio. The caller will then try - * to allocate more bios while submitting this partial bio. - */ - gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; len = (size > PAGE_SIZE) ? PAGE_SIZE : size; - if (!bio_add_page(clone, page, len, 0)) { - mempool_free(page, cc->page_pool); - break; - } + bvec = &clone->bi_io_vec[clone->bi_vcnt++]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = 0; - size -= len; - } + clone->bi_iter.bi_size += len; - if (!clone->bi_iter.bi_size) { - bio_put(clone); - return NULL; + size -= len; } return clone; @@ -1020,7 +1003,6 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc, io->base_bio = bio; io->sector = sector; io->error = 0; - io->base_io = NULL; io->ctx.req = NULL; atomic_set(&io->io_pending, 0); } @@ -1033,13 +1015,11 @@ static void crypt_inc_pending(struct dm_crypt_io *io) /* * One of the bios was finished. Check for completion of * the whole request and correctly clean up the buffer. - * If base_io is set, wait for the last fragment to complete. 
*/ static void crypt_dec_pending(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; struct bio *base_bio = io->base_bio; - struct dm_crypt_io *base_io = io->base_io; int error = io->error; if (!atomic_dec_and_test(&io->io_pending)) @@ -1050,13 +1030,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io) if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size)) mempool_free(io, cc->io_pool); - if (likely(!base_io)) - bio_endio(base_bio, error); - else { - if (error && !base_io->error) - base_io->error = error; - crypt_dec_pending(base_io); - } + bio_endio(base_bio, error); } /* @@ -1192,10 +1166,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; struct bio *clone; - struct dm_crypt_io *new_io; int crypt_finished; - unsigned out_of_pages = 0; - unsigned remaining = io->base_bio->bi_iter.bi_size; sector_t sector = io->sector; int r; @@ -1205,80 +1176,30 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) crypt_inc_pending(io); crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector); - /* - * The allocated buffers can be smaller than the whole bio, - * so repeat the whole process until all the data can be handled. - */ - while (remaining) { - clone = crypt_alloc_buffer(io, remaining, &out_of_pages); - if (unlikely(!clone)) { - io->error = -ENOMEM; - break; - } - - io->ctx.bio_out = clone; - io->ctx.iter_out = clone->bi_iter; - - remaining -= clone->bi_iter.bi_size; - sector += bio_sectors(clone); - - crypt_inc_pending(io); - - r = crypt_convert(cc, &io->ctx); - if (r < 0) - io->error = -EIO; - - crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); - - /* Encryption was already finished, submit io now */ - if (crypt_finished) { - kcryptd_crypt_write_io_submit(io, 0); - - /* - * If there was an error, do not try next fragments. - * For async, error is processed in async handler. - */ - if (unlikely(r < 0)) - break; + clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); + if (unlikely(!clone)) { + io->error = -EIO; + goto dec; + } - io->sector = sector; - } + io->ctx.bio_out = clone; + io->ctx.iter_out = clone->bi_iter; - /* - * Out of memory -> run queues - * But don't wait if split was due to the io size restriction - */ - if (unlikely(out_of_pages)) - congestion_wait(BLK_RW_ASYNC, HZ/100); + sector += bio_sectors(clone); - /* - * With async crypto it is unsafe to share the crypto context - * between fragments, so switch to a new dm_crypt_io structure. - */ - if (unlikely(!crypt_finished && remaining)) { - new_io = mempool_alloc(cc->io_pool, GFP_NOIO); - crypt_io_init(new_io, io->cc, io->base_bio, sector); - crypt_inc_pending(new_io); - crypt_convert_init(cc, &new_io->ctx, NULL, - io->base_bio, sector); - new_io->ctx.iter_in = io->ctx.iter_in; - - /* - * Fragments after the first use the base_io - * pending count. 
- */ - if (!io->base_io) - new_io->base_io = io; - else { - new_io->base_io = io->base_io; - crypt_inc_pending(io->base_io); - crypt_dec_pending(io); - } + crypt_inc_pending(io); + r = crypt_convert(cc, &io->ctx); + if (r) + io->error = -EIO; + crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); - io = new_io; - } + /* Encryption was already finished, submit io now */ + if (crypt_finished) { + kcryptd_crypt_write_io_submit(io, 0); + io->sector = sector; } +dec: crypt_dec_pending(io); } @@ -1746,7 +1667,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size, ARCH_KMALLOC_MINALIGN); - cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); + cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0); if (!cc->page_pool) { ti->error = "Cannot allocate page mempool"; goto bad; -- cgit v0.10.2 From 7145c241a1bf2841952c3e297c4080b357b3e52d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 13 Feb 2015 08:24:41 -0500 Subject: dm crypt: avoid deadlock in mempools Fix a theoretical deadlock introduced in the previous commit ("dm crypt: don't allocate pages for a partial request"). The function crypt_alloc_buffer may be called concurrently. If we allocate from the mempool concurrently, there is a possibility of deadlock. For example, if we have mempool of 256 pages, two processes, each wanting 256, pages allocate from the mempool concurrently, it may deadlock in a situation where both processes have allocated 128 pages and the mempool is exhausted. To avoid such a scenario we allocate the pages under a mutex. In order to not degrade performance with excessive locking, we try non-blocking allocations without a mutex first and if that fails, we fallback to a blocking allocations with a mutex. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 6199245..fa1dba1 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -124,6 +124,7 @@ struct crypt_config { mempool_t *req_pool; mempool_t *page_pool; struct bio_set *bs; + struct mutex bio_alloc_lock; struct workqueue_struct *io_queue; struct workqueue_struct *crypt_queue; @@ -949,27 +950,51 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone); /* * Generate a new unfragmented bio with the given size * This should never violate the device limitations + * + * This function may be called concurrently. If we allocate from the mempool + * concurrently, there is a possibility of deadlock. For example, if we have + * mempool of 256 pages, two processes, each wanting 256, pages allocate from + * the mempool concurrently, it may deadlock in a situation where both processes + * have allocated 128 pages and the mempool is exhausted. + * + * In order to avoid this scenario we allocate the pages under a mutex. + * + * In order to not degrade performance with excessive locking, we try + * non-blocking allocations without a mutex first but on failure we fallback + * to blocking allocations with a mutex. 
*/ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) { struct crypt_config *cc = io->cc; struct bio *clone; unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; - unsigned i, len; + gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM; + unsigned i, len, remaining_size; struct page *page; struct bio_vec *bvec; +retry: + if (unlikely(gfp_mask & __GFP_WAIT)) + mutex_lock(&cc->bio_alloc_lock); + clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); if (!clone) - return NULL; + goto return_clone; clone_init(io, clone); + remaining_size = size; + for (i = 0; i < nr_iovecs; i++) { page = mempool_alloc(cc->page_pool, gfp_mask); + if (!page) { + crypt_free_buffer_pages(cc, clone); + bio_put(clone); + gfp_mask |= __GFP_WAIT; + goto retry; + } - len = (size > PAGE_SIZE) ? PAGE_SIZE : size; + len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size; bvec = &clone->bi_io_vec[clone->bi_vcnt++]; bvec->bv_page = page; @@ -978,9 +1003,13 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) clone->bi_iter.bi_size += len; - size -= len; + remaining_size -= len; } +return_clone: + if (unlikely(gfp_mask & __GFP_WAIT)) + mutex_unlock(&cc->bio_alloc_lock); + return clone; } @@ -1679,6 +1708,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + mutex_init(&cc->bio_alloc_lock); + ret = -EINVAL; if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid iv_offset sector"; -- cgit v0.10.2 From 94f5e0243c48aa01441c987743dc468e2d6eaca2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 13 Feb 2015 08:25:26 -0500 Subject: dm crypt: remove unused io_pool and _crypt_io_pool The previous commit ("dm crypt: don't allocate pages for a partial request") stopped using the io_pool slab mempool and backing _crypt_io_pool kmem cache. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index fa1dba1..c29daf4 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -120,7 +120,6 @@ struct crypt_config { * pool for per bio private data, crypto requests and * encryption requeusts/buffer pages */ - mempool_t *io_pool; mempool_t *req_pool; mempool_t *page_pool; struct bio_set *bs; @@ -173,8 +172,6 @@ struct crypt_config { #define MIN_IOS 16 -static struct kmem_cache *_crypt_io_pool; - static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); @@ -1056,8 +1053,6 @@ static void crypt_dec_pending(struct dm_crypt_io *io) if (io->ctx.req) crypt_free_req(cc, io->ctx.req, base_bio); - if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size)) - mempool_free(io, cc->io_pool); bio_endio(base_bio, error); } @@ -1445,8 +1440,6 @@ static void crypt_dtr(struct dm_target *ti) mempool_destroy(cc->page_pool); if (cc->req_pool) mempool_destroy(cc->req_pool); - if (cc->io_pool) - mempool_destroy(cc->io_pool); if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); @@ -1660,13 +1653,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret < 0) goto bad; - ret = -ENOMEM; - cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); - if (!cc->io_pool) { - ti->error = "Cannot allocate crypt io mempool"; - goto bad; - } - cc->dmreq_start = sizeof(struct ablkcipher_request); cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request)); @@ -1684,6 +1670,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc)); } + ret = -ENOMEM; cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size); if (!cc->req_pool) { @@ -1965,15 +1952,9 @@ static int __init dm_crypt_init(void) { int r; - _crypt_io_pool = KMEM_CACHE(dm_crypt_io, 0); - if (!_crypt_io_pool) - return -ENOMEM; - r = dm_register_target(&crypt_target); - if (r < 0) { + if (r < 0) DMERR("register failed %d", r); - kmem_cache_destroy(_crypt_io_pool); - } return r; } @@ -1981,7 +1962,6 @@ static int __init dm_crypt_init(void) static void __exit dm_crypt_exit(void) { dm_unregister_target(&crypt_target); - kmem_cache_destroy(_crypt_io_pool); } module_init(dm_crypt_init); -- cgit v0.10.2 From dc2676210c425ee8e5cb1bec5bc84d004ddf4179 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 13 Feb 2015 08:25:59 -0500 Subject: dm crypt: offload writes to thread Submitting write bios directly in the encryption thread caused serious performance degradation. On a multiprocessor machine, encryption requests finish in a different order than they were submitted. Consequently, write requests would be submitted in a different order and it could cause severe performance degradation. Move the submission of write requests to a separate thread so that the requests can be sorted before submitting. But this commit improves dm-crypt performance even without having dm-crypt perform request sorting (in particular it enables IO schedulers like CFQ to sort more effectively). Note: it is required that a previous commit ("dm crypt: don't allocate pages for a partial request") be applied before applying this patch. 
Otherwise, this commit could introduce a crash. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index c29daf4..8c0e36b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ struct dm_crypt_io { atomic_t io_pending; int error; sector_t sector; + + struct list_head list; } CRYPTO_MINALIGN_ATTR; struct dm_crypt_request { @@ -128,6 +131,10 @@ struct crypt_config { struct workqueue_struct *io_queue; struct workqueue_struct *crypt_queue; + struct task_struct *write_thread; + wait_queue_head_t write_thread_wait; + struct list_head write_thread_list; + char *cipher; char *cipher_string; @@ -1136,37 +1143,89 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) return 0; } +static void kcryptd_io_read_work(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + + crypt_inc_pending(io); + if (kcryptd_io_read(io, GFP_NOIO)) + io->error = -ENOMEM; + crypt_dec_pending(io); +} + +static void kcryptd_queue_read(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->cc; + + INIT_WORK(&io->work, kcryptd_io_read_work); + queue_work(cc->io_queue, &io->work); +} + static void kcryptd_io_write(struct dm_crypt_io *io) { struct bio *clone = io->ctx.bio_out; + generic_make_request(clone); } -static void kcryptd_io(struct work_struct *work) +static int dmcrypt_write(void *data) { - struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + struct crypt_config *cc = data; + while (1) { + struct list_head local_list; + struct blk_plug plug; - if (bio_data_dir(io->base_bio) == READ) { - crypt_inc_pending(io); - if (kcryptd_io_read(io, GFP_NOIO)) - io->error = -ENOMEM; - crypt_dec_pending(io); - } else - kcryptd_io_write(io); -} + DECLARE_WAITQUEUE(wait, current); -static void kcryptd_queue_io(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->cc; + spin_lock_irq(&cc->write_thread_wait.lock); +continue_locked: - INIT_WORK(&io->work, kcryptd_io); - queue_work(cc->io_queue, &io->work); + if (!list_empty(&cc->write_thread_list)) + goto pop_from_list; + + __set_current_state(TASK_INTERRUPTIBLE); + __add_wait_queue(&cc->write_thread_wait, &wait); + + spin_unlock_irq(&cc->write_thread_wait.lock); + + if (unlikely(kthread_should_stop())) { + set_task_state(current, TASK_RUNNING); + remove_wait_queue(&cc->write_thread_wait, &wait); + break; + } + + schedule(); + + set_task_state(current, TASK_RUNNING); + spin_lock_irq(&cc->write_thread_wait.lock); + __remove_wait_queue(&cc->write_thread_wait, &wait); + goto continue_locked; + +pop_from_list: + local_list = cc->write_thread_list; + local_list.next->prev = &local_list; + local_list.prev->next = &local_list; + INIT_LIST_HEAD(&cc->write_thread_list); + + spin_unlock_irq(&cc->write_thread_wait.lock); + + blk_start_plug(&plug); + do { + struct dm_crypt_io *io = container_of(local_list.next, + struct dm_crypt_io, list); + list_del(&io->list); + kcryptd_io_write(io); + } while (!list_empty(&local_list)); + blk_finish_plug(&plug); + } + return 0; } static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) { struct bio *clone = io->ctx.bio_out; struct crypt_config *cc = io->cc; + unsigned long flags; if (unlikely(io->error < 0)) { crypt_free_buffer_pages(cc, clone); @@ -1180,10 +1239,10 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) clone->bi_iter.bi_sector = cc->start + 
io->sector; - if (async) - kcryptd_queue_io(io); - else - generic_make_request(clone); + spin_lock_irqsave(&cc->write_thread_wait.lock, flags); + list_add_tail(&io->list, &cc->write_thread_list); + wake_up_locked(&cc->write_thread_wait); + spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags); } static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) @@ -1426,6 +1485,9 @@ static void crypt_dtr(struct dm_target *ti) if (!cc) return; + if (cc->write_thread) + kthread_stop(cc->write_thread); + if (cc->io_queue) destroy_workqueue(cc->io_queue); if (cc->crypt_queue) @@ -1764,6 +1826,18 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + init_waitqueue_head(&cc->write_thread_wait); + INIT_LIST_HEAD(&cc->write_thread_list); + + cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write"); + if (IS_ERR(cc->write_thread)) { + ret = PTR_ERR(cc->write_thread); + cc->write_thread = NULL; + ti->error = "Couldn't spawn write thread"; + goto bad; + } + wake_up_process(cc->write_thread); + ti->num_flush_bios = 1; ti->discard_zeroes_data_unsupported = true; @@ -1798,7 +1872,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) if (bio_data_dir(io->base_bio) == READ) { if (kcryptd_io_read(io, GFP_NOWAIT)) - kcryptd_queue_io(io); + kcryptd_queue_read(io); } else kcryptd_queue_crypt(io); -- cgit v0.10.2 From 0f5d8e6ee758f7023e4353cca75d785b2d4f6abe Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 13 Feb 2015 08:27:08 -0500 Subject: dm crypt: add 'submit_from_crypt_cpus' option Make it possible to disable offloading writes by setting the optional 'submit_from_crypt_cpus' table argument. There are some situations where offloading write bios from the encryption threads to a single thread degrades performance significantly. The default is to offload write bios to the same thread because it benefits CFQ to have writes submitted using the same IO context. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index 571f24f..ad69778 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -51,7 +51,7 @@ Parameters: \ Otherwise #opt_params is the number of following arguments. Example of optional parameters section: - 2 allow_discards same_cpu_crypt + 3 allow_discards same_cpu_crypt submit_from_crypt_cpus allow_discards Block discard requests (a.k.a. TRIM) are passed through the crypt device. @@ -68,6 +68,14 @@ same_cpu_crypt The default is to use an unbound workqueue so that encryption work is automatically balanced between available CPUs. +submit_from_crypt_cpus + Disable offloading writes to a separate thread after encryption. + There are some situations where offloading write bios from the + encryption threads to a single thread degrades performance + significantly. The default is to offload write bios to the same + thread because it benefits CFQ to have writes submitted using the + same context. + Example scripts =============== LUKS (Linux Unified Key Setup) is now the preferred way to set up disk diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8c0e36b..4519a7c 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -110,7 +110,8 @@ struct iv_tcw_private { * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. 
*/ -enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, DM_CRYPT_SAME_CPU }; +enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, + DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD }; /* * The fields in here must be read only after initialization. @@ -1239,6 +1240,11 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) clone->bi_iter.bi_sector = cc->start + io->sector; + if (likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) { + generic_make_request(clone); + return; + } + spin_lock_irqsave(&cc->write_thread_wait.lock, flags); list_add_tail(&io->list, &cc->write_thread_list); wake_up_locked(&cc->write_thread_wait); @@ -1693,7 +1699,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) char dummy; static struct dm_arg _args[] = { - {0, 2, "Invalid number of feature args"}, + {0, 3, "Invalid number of feature args"}, }; if (argc < 5) { @@ -1802,6 +1808,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) else if (!strcasecmp(opt_string, "same_cpu_crypt")) set_bit(DM_CRYPT_SAME_CPU, &cc->flags); + else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) + set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + else { ti->error = "Invalid feature arguments"; goto bad; @@ -1905,12 +1914,15 @@ static void crypt_status(struct dm_target *ti, status_type_t type, num_feature_args += !!ti->num_discard_bios; num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); + num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); if (num_feature_args) { DMEMIT(" %d", num_feature_args); if (ti->num_discard_bios) DMEMIT(" allow_discards"); if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) DMEMIT(" same_cpu_crypt"); + if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) + DMEMIT(" submit_from_crypt_cpus"); } break; -- cgit v0.10.2 From b3c5fd3052492f1b8d060799d4f18be5a5438add Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 13 Feb 2015 08:27:41 -0500 Subject: dm crypt: sort writes Write requests are sorted in a red-black tree structure and are submitted in the sorted order. In theory the sorting should be performed by the underlying disk scheduler, however, in practice the disk scheduler only accepts and sorts a finite number of requests. To allow the sorting of all requests, dm-crypt needs to implement its own sorting. The overhead associated with rbtree-based sorting is considered negligible so it is not used conditionally. Even on SSD sorting can be beneficial since in-order request dispatch promotes lower latency IO completion to the upper layers. 
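The sorting itself is ordinary binary-tree insertion keyed by the starting sector, followed by an in-order drain at submission time. The sketch below keeps that idea self-contained by using a plain (unbalanced) search tree; the real patch uses the kernel rbtree API (rb_link_node/rb_insert_color to insert, rb_first/rb_erase to drain), and struct io here is only a stand-in for struct dm_crypt_io.

#include <stdio.h>
#include <stdlib.h>

struct io {
        unsigned long long sector;
        struct io *left, *right;
};

/* Walk down comparing sectors, exactly the shape of the insertion loop
 * added to kcryptd_crypt_write_io_submit(), minus rebalancing. */
static void insert(struct io **rbp, struct io *io)
{
        while (*rbp) {
                struct io *parent = *rbp;

                rbp = io->sector < parent->sector ? &parent->left
                                                  : &parent->right;
        }
        *rbp = io;
}

/* In-order traversal == ascending sector order, i.e. the order in which
 * the write thread would issue the bios. */
static void drain(struct io *node)
{
        if (!node)
                return;
        drain(node->left);
        printf("submit sector %llu\n", node->sector);
        drain(node->right);
        free(node);
}

int main(void)
{
        unsigned long long sectors[] = { 4096, 8, 1024, 16, 2048 };
        struct io *root = NULL;

        for (size_t i = 0; i < sizeof(sectors) / sizeof(sectors[0]); i++) {
                struct io *io = calloc(1, sizeof(*io));

                io->sector = sectors[i];
                insert(&root, io);
        }
        drain(root);    /* writes leave in sector order, as described above */
        return 0;
}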
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 4519a7c..713a962 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -60,7 +61,7 @@ struct dm_crypt_io { int error; sector_t sector; - struct list_head list; + struct rb_node rb_node; } CRYPTO_MINALIGN_ATTR; struct dm_crypt_request { @@ -134,7 +135,7 @@ struct crypt_config { struct task_struct *write_thread; wait_queue_head_t write_thread_wait; - struct list_head write_thread_list; + struct rb_root write_tree; char *cipher; char *cipher_string; @@ -1169,11 +1170,15 @@ static void kcryptd_io_write(struct dm_crypt_io *io) generic_make_request(clone); } +#define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node) + static int dmcrypt_write(void *data) { struct crypt_config *cc = data; + struct dm_crypt_io *io; + while (1) { - struct list_head local_list; + struct rb_root write_tree; struct blk_plug plug; DECLARE_WAITQUEUE(wait, current); @@ -1181,7 +1186,7 @@ static int dmcrypt_write(void *data) spin_lock_irq(&cc->write_thread_wait.lock); continue_locked: - if (!list_empty(&cc->write_thread_list)) + if (!RB_EMPTY_ROOT(&cc->write_tree)) goto pop_from_list; __set_current_state(TASK_INTERRUPTIBLE); @@ -1203,20 +1208,22 @@ continue_locked: goto continue_locked; pop_from_list: - local_list = cc->write_thread_list; - local_list.next->prev = &local_list; - local_list.prev->next = &local_list; - INIT_LIST_HEAD(&cc->write_thread_list); - + write_tree = cc->write_tree; + cc->write_tree = RB_ROOT; spin_unlock_irq(&cc->write_thread_wait.lock); + BUG_ON(rb_parent(write_tree.rb_node)); + + /* + * Note: we cannot walk the tree here with rb_next because + * the structures may be freed when kcryptd_io_write is called. 
+ */ blk_start_plug(&plug); do { - struct dm_crypt_io *io = container_of(local_list.next, - struct dm_crypt_io, list); - list_del(&io->list); + io = crypt_io_from_node(rb_first(&write_tree)); + rb_erase(&io->rb_node, &write_tree); kcryptd_io_write(io); - } while (!list_empty(&local_list)); + } while (!RB_EMPTY_ROOT(&write_tree)); blk_finish_plug(&plug); } return 0; @@ -1227,6 +1234,8 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) struct bio *clone = io->ctx.bio_out; struct crypt_config *cc = io->cc; unsigned long flags; + sector_t sector; + struct rb_node **rbp, *parent; if (unlikely(io->error < 0)) { crypt_free_buffer_pages(cc, clone); @@ -1246,7 +1255,19 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) } spin_lock_irqsave(&cc->write_thread_wait.lock, flags); - list_add_tail(&io->list, &cc->write_thread_list); + rbp = &cc->write_tree.rb_node; + parent = NULL; + sector = io->sector; + while (*rbp) { + parent = *rbp; + if (sector < crypt_io_from_node(parent)->sector) + rbp = &(*rbp)->rb_left; + else + rbp = &(*rbp)->rb_right; + } + rb_link_node(&io->rb_node, parent, rbp); + rb_insert_color(&io->rb_node, &cc->write_tree); + wake_up_locked(&cc->write_thread_wait); spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags); } @@ -1836,7 +1857,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } init_waitqueue_head(&cc->write_thread_wait); - INIT_LIST_HEAD(&cc->write_thread_list); + cc->write_tree = RB_ROOT; cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write"); if (IS_ERR(cc->write_thread)) { -- cgit v0.10.2 From 2bec1f4a8832e74ebbe859f176d8a9cb20dd97f4 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 17 Feb 2015 14:30:53 -0500 Subject: dm: fix a race condition in dm_get_md The function dm_get_md finds a device mapper device with a given dev_t, increases the reference count and returns the pointer. dm_get_md calls dm_find_md, dm_find_md takes _minor_lock, finds the device, tests that the device doesn't have DMF_DELETING or DMF_FREEING flag, drops _minor_lock and returns pointer to the device. dm_get_md then calls dm_get. dm_get calls BUG if the device has the DMF_FREEING flag, otherwise it increments the reference count. There is a possible race condition - after dm_find_md exits and before dm_get is called, there are no locks held, so the device may disappear or DMF_FREEING flag may be set, which results in BUG. To fix this bug, we need to call dm_get while we hold _minor_lock. This patch renames dm_find_md to dm_get_md and changes it so that it calls dm_get while holding the lock. 
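The two shapes of the lookup are easy to contrast outside the kernel. In the sketch below a pthread mutex stands in for _minor_lock and an invented struct obj with refcount/freeing fields stands in for the mapped_device and its DMF_FREEING flag; the only point being made is that the flag test and the reference bump must sit in the same critical section as the lookup.

#include <pthread.h>
#include <stdio.h>

struct obj {
        int refcount;
        int freeing;
};

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *registry;    /* stands in for the _minor_idr lookup */

/* Racy shape (old dm_find_md() + dm_get()): the reference is taken only
 * after the lock has been dropped. */
static struct obj *get_racy(void)
{
        struct obj *o;

        pthread_mutex_lock(&registry_lock);
        o = registry;
        if (o && o->freeing)
                o = NULL;
        pthread_mutex_unlock(&registry_lock);
        /* window: o may be marked freeing or torn down right here */
        if (o)
                o->refcount++;
        return o;
}

/* Fixed shape (new dm_get_md()): lookup, flag check and reference bump
 * all happen under the one lock. */
static struct obj *get_safe(void)
{
        struct obj *o;

        pthread_mutex_lock(&registry_lock);
        o = registry;
        if (o && o->freeing)
                o = NULL;
        if (o)
                o->refcount++;          /* still protected */
        pthread_mutex_unlock(&registry_lock);
        return o;
}

int main(void)
{
        struct obj o = { .refcount = 0, .freeing = 0 };

        registry = &o;
        printf("racy: %p ref=%d\n", (void *)get_racy(), o.refcount);
        printf("safe: %p ref=%d\n", (void *)get_safe(), o.refcount);
        return 0;
}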
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ec1444f..73f2880 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2571,7 +2571,7 @@ int dm_setup_md_queue(struct mapped_device *md) return 0; } -static struct mapped_device *dm_find_md(dev_t dev) +struct mapped_device *dm_get_md(dev_t dev) { struct mapped_device *md; unsigned minor = MINOR(dev); @@ -2582,12 +2582,15 @@ static struct mapped_device *dm_find_md(dev_t dev) spin_lock(&_minor_lock); md = idr_find(&_minor_idr, minor); - if (md && (md == MINOR_ALLOCED || - (MINOR(disk_devt(dm_disk(md))) != minor) || - dm_deleting_md(md) || - test_bit(DMF_FREEING, &md->flags))) { - md = NULL; - goto out; + if (md) { + if ((md == MINOR_ALLOCED || + (MINOR(disk_devt(dm_disk(md))) != minor) || + dm_deleting_md(md) || + test_bit(DMF_FREEING, &md->flags))) { + md = NULL; + goto out; + } + dm_get(md); } out: @@ -2595,16 +2598,6 @@ out: return md; } - -struct mapped_device *dm_get_md(dev_t dev) -{ - struct mapped_device *md = dm_find_md(dev); - - if (md) - dm_get(md); - - return md; -} EXPORT_SYMBOL_GPL(dm_get_md); void *dm_get_mdptr(struct mapped_device *md) -- cgit v0.10.2 From 22aa66a3ee5b61e0f4a0bfeabcaa567861109ec3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 17 Feb 2015 14:34:00 -0500 Subject: dm snapshot: fix a possible invalid memory access on unload When the snapshot target is unloaded, snapshot_dtr() waits until pending_exceptions_count drops to zero. Then, it destroys the snapshot. Therefore, the function that decrements pending_exceptions_count should not touch the snapshot structure after the decrement. pending_complete() calls free_pending_exception(), which decrements pending_exceptions_count, and then it performs up_write(&s->lock) and it calls retry_origin_bios() which dereferences s->origin. These two memory accesses to the fields of the snapshot may touch the dm_snapshot struture after it is freed. This patch moves the call to free_pending_exception() to the end of pending_complete(), so that the snapshot will not be destroyed while pending_complete() is in progress. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 864b03f..8b204ae2 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1432,8 +1432,6 @@ out: full_bio->bi_private = pe->full_bio_private; atomic_inc(&full_bio->bi_remaining); } - free_pending_exception(pe); - increment_pending_exceptions_done_count(); up_write(&s->lock); @@ -1450,6 +1448,8 @@ out: } retry_origin_bios(s, origin_bios); + + free_pending_exception(pe); } static void commit_callback(void *context, int success) -- cgit v0.10.2
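The dm-snapshot fix is an instance of a general teardown rule: when a destructor waits for a pending counter to drain to zero and then frees the object, the code that drops the counter must not touch the object afterwards. A compact userspace rendering of that rule follows; pending_complete() and snapshot_dtr() are the function names from the commit above, while the struct fields and the condition-variable plumbing are invented for the sketch.

#include <pthread.h>
#include <stdio.h>

struct snap {
        pthread_mutex_t lock;
        pthread_cond_t idle;
        int pending_count;
        int origin;             /* stands in for s->origin and friends */
};

static void pending_complete(struct snap *s)
{
        /* ...finish the exception, touch s->origin, requeue bios... */
        s->origin++;

        /* Drop the count LAST: after this point the destructor is free
         * to tear *s down, so nothing below may dereference s. */
        pthread_mutex_lock(&s->lock);
        if (--s->pending_count == 0)
                pthread_cond_signal(&s->idle);
        pthread_mutex_unlock(&s->lock);
}

static void snapshot_dtr(struct snap *s)
{
        pthread_mutex_lock(&s->lock);
        while (s->pending_count)
                pthread_cond_wait(&s->idle, &s->lock);
        pthread_mutex_unlock(&s->lock);
        /* all pending work has drained; safe to free the snapshot here */
}

int main(void)
{
        struct snap s = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .idle = PTHREAD_COND_INITIALIZER,
                .pending_count = 1,
        };

        pending_complete(&s);
        snapshot_dtr(&s);
        printf("torn down cleanly\n");
        return 0;
}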