From d17ab4592760973d95961bb29f26969246e9dabc Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 8 Nov 2013 19:43:48 -0700 Subject: block: cleanup removing dependency on bootmem headers Cc: Yinghai Lu Cc: Tejun Heo Cc: Andrew Morton Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Signed-off-by: Jens Axboe diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 46cd7bd..242df01 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -6,7 +6,6 @@ #include #include #include -#include /* for max_pfn/max_low_pfn */ #include #include "blk.h" -- cgit v0.10.2 From 1fa8455deb92e9ec7756df23030e73b2d28eeca7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Nov 2013 21:55:27 -0800 Subject: bcache: Fix dirty_data accounting Dirty data accounting wasn't quite right - firstly, we were adding the key we're inserting after it could have merged with another dirty key already in the btree, and secondly we could sometimes pass the wrong offset to bcache_dev_sectors_dirty_add() for dirty data we were overwriting - which is important when tracking dirty data by stripe. Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f42fc7e..d1734d9 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1742,6 +1742,9 @@ static bool fix_overlapping_extents(struct btree *b, if (bkey_cmp(insert, k) < 0) { bch_cut_front(insert, k); } else { + if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) + old_offset = KEY_START(insert); + if (bkey_written(b, k) && bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { /* @@ -1803,6 +1806,10 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, if (fix_overlapping_extents(b, k, &iter, op)) return false; + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); + while (m != end(i) && bkey_cmp(k, &START_KEY(m)) > 0) prev = m, m = bkey_next(m); @@ -1831,10 +1838,6 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, insert: shift_keys(b, m, k); copy: bkey_copy(m, k); merged: - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), - KEY_START(k), KEY_SIZE(k)); - bch_check_keys(b, "%u for %s", status, op_type(op)); if (b->level && !KEY_OFFSET(k)) -- cgit v0.10.2 From 7857d5d470ec53bae187d144c69065ad3c0ebc21 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 8 Oct 2013 15:50:46 -0700 Subject: bcache: Fix a journalling performance bug diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8435f81..7c9e6bf 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -554,32 +554,26 @@ static void journal_write_endio(struct bio *bio, int error) struct journal_write *w = bio->bi_private; cache_set_err_on(error, w->c, "journal io error"); - closure_put(&w->c->journal.io.cl); + closure_put(&w->c->journal.io); } static void journal_write(struct closure *); static void journal_write_done(struct closure *cl) { - struct journal *j = container_of(cl, struct journal, io.cl); - struct cache_set *c = container_of(j, struct cache_set, journal); - + struct journal *j = container_of(cl, struct journal, io); struct journal_write *w = (j->cur == j->w) ? 
&j->w[1] : &j->w[0]; __closure_wake_up(&w->wait); - - if (c->journal_delay_ms) - closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms)); - - continue_at(cl, journal_write, system_wq); + continue_at_nobarrier(cl, journal_write, system_wq); } static void journal_write_unlocked(struct closure *cl) __releases(c->journal.lock) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); + struct cache_set *c = container_of(cl, struct cache_set, journal.io); struct cache *ca; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; @@ -660,7 +654,7 @@ static void journal_write_unlocked(struct closure *cl) static void journal_write(struct closure *cl) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); + struct cache_set *c = container_of(cl, struct cache_set, journal.io); spin_lock(&c->journal.lock); journal_write_unlocked(cl); @@ -669,7 +663,10 @@ static void journal_write(struct closure *cl) static void __journal_try_write(struct cache_set *c, bool noflush) __releases(c->journal.lock) { - struct closure *cl = &c->journal.io.cl; + struct closure *cl = &c->journal.io; + struct journal_write *w = c->journal.cur; + + w->need_write = true; if (!closure_trylock(cl, &c->cl)) spin_unlock(&c->journal.lock); @@ -688,18 +685,24 @@ void bch_journal_meta(struct cache_set *c, struct closure *cl) if (CACHE_SYNC(&c->sb)) { spin_lock(&c->journal.lock); - w = c->journal.cur; - w->need_write = true; if (cl) BUG_ON(!closure_wait(&w->wait, cl)); - closure_flush(&c->journal.io); __journal_try_write(c, true); } } +static void journal_write_work(struct work_struct *work) +{ + struct cache_set *c = container_of(to_delayed_work(work), + struct cache_set, + journal.work); + spin_lock(&c->journal.lock); + journal_try_write(c); +} + /* * Entry point to the journalling code - bio_insert() and btree_invalidate() * pass bch_journal() a list of keys to be journalled, and then @@ -739,7 +742,6 @@ void bch_journal(struct closure *cl) } w = c->journal.cur; - w->need_write = true; b = __set_blocks(w->data, w->data->keys + n, c); if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || @@ -755,8 +757,6 @@ void bch_journal(struct closure *cl) BUG_ON(!closure_wait(&w->wait, cl)); - closure_flush(&c->journal.io); - journal_try_write(c); continue_at(cl, bch_journal, bcache_wq); } @@ -768,11 +768,15 @@ void bch_journal(struct closure *cl) atomic_inc(op->journal); if (op->flush_journal) { - closure_flush(&c->journal.io); closure_wait(&w->wait, cl->parent); + journal_try_write(c); + } else if (!w->need_write) { + schedule_delayed_work(&c->journal.work, + msecs_to_jiffies(c->journal_delay_ms)); + spin_unlock(&c->journal.lock); + } else { + spin_unlock(&c->journal.lock); } - - journal_try_write(c); out: bch_btree_insert_async(cl); } @@ -790,6 +794,7 @@ int bch_journal_alloc(struct cache_set *c) closure_init_unlocked(&j->io); spin_lock_init(&j->lock); + INIT_DELAYED_WORK(&j->work, journal_write_work); c->journal_delay_ms = 100; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 3d78512..3ca93d3 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -140,7 +140,8 @@ struct journal { spinlock_t lock; /* used when waiting because the journal was full */ struct closure_waitlist wait; - struct closure_with_timer io; + struct closure io; + struct delayed_work work; /* Number of blocks free in the bucket(s) we're currently writing to */ unsigned blocks_free; -- cgit v0.10.2 From dd9ec84da530d3a92e99ad1f52edae44533cc27f Mon Sep 17 
00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2013 17:12:52 -0700 Subject: bcache: Fix a lockdep splat bch_keybuf_del() takes a spinlock that can't be taken in interrupt context - whoops. Fortunately, this code isn't enabled by default (you have to toggle a sysfs thing). Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 1a3b4f4..dd8a035 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -113,7 +113,7 @@ static void write_moving(struct closure *cl) bch_insert_data(&s->op.cl); } - continue_at(cl, write_moving_finish, NULL); + continue_at(cl, write_moving_finish, bch_gc_wq); } static void read_moving_submit(struct closure *cl) -- cgit v0.10.2 From 49b1212dfacfe51f951442563d1617bb06aac575 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:16:09 -0700 Subject: bcache: Use blkdev_issue_discard() The old asynchronous discard code was really a relic from when all the allocation code was asynchronous - now that allocation runs out of a dedicated thread there's no point in keeping around all that complicated machinery. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index e45f557..e033b02 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -63,13 +63,12 @@ #include "bcache.h" #include "btree.h" +#include #include #include #include #include -#define MAX_IN_FLIGHT_DISCARDS 8U - /* Bucket heap / gen */ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) @@ -121,75 +120,6 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) mutex_unlock(&c->bucket_lock); } -/* Discard/TRIM */ - -struct discard { - struct list_head list; - struct work_struct work; - struct cache *ca; - long bucket; - - struct bio bio; - struct bio_vec bv; -}; - -static void discard_finish(struct work_struct *w) -{ - struct discard *d = container_of(w, struct discard, work); - struct cache *ca = d->ca; - char buf[BDEVNAME_SIZE]; - - if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) { - pr_notice("discard error on %s, disabling", - bdevname(ca->bdev, buf)); - d->ca->discard = 0; - } - - mutex_lock(&ca->set->bucket_lock); - - fifo_push(&ca->free, d->bucket); - list_add(&d->list, &ca->discards); - atomic_dec(&ca->discards_in_flight); - - mutex_unlock(&ca->set->bucket_lock); - - closure_wake_up(&ca->set->bucket_wait); - wake_up_process(ca->alloc_thread); - - closure_put(&ca->set->cl); -} - -static void discard_endio(struct bio *bio, int error) -{ - struct discard *d = container_of(bio, struct discard, bio); - schedule_work(&d->work); -} - -static void do_discard(struct cache *ca, long bucket) -{ - struct discard *d = list_first_entry(&ca->discards, - struct discard, list); - - list_del(&d->list); - d->bucket = bucket; - - atomic_inc(&ca->discards_in_flight); - closure_get(&ca->set->cl); - - bio_init(&d->bio); - - d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket); - d->bio.bi_bdev = ca->bdev; - d->bio.bi_rw = REQ_WRITE|REQ_DISCARD; - d->bio.bi_max_vecs = 1; - d->bio.bi_io_vec = d->bio.bi_inline_vecs; - d->bio.bi_size = bucket_bytes(ca); - d->bio.bi_end_io = discard_endio; - bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - - submit_bio(0, &d->bio); -} - /* Allocation */ static inline bool can_inc_bucket_gen(struct bucket *b) @@ -398,16 +328,18 @@ static int bch_allocator_thread(void *arg) else break; - allocator_wait(ca, (int) fifo_free(&ca->free) > - atomic_read(&ca->discards_in_flight)); - if (ca->discard) { - 
allocator_wait(ca, !list_empty(&ca->discards)); - do_discard(ca, bucket); - } else { - fifo_push(&ca->free, bucket); - closure_wake_up(&ca->set->bucket_wait); + mutex_unlock(&ca->set->bucket_lock); + blkdev_issue_discard(ca->bdev, + bucket_to_sector(ca->set, bucket), + ca->sb.block_size, GFP_KERNEL, 0); + mutex_lock(&ca->set->bucket_lock); } + + allocator_wait(ca, !fifo_full(&ca->free)); + + fifo_push(&ca->free, bucket); + closure_wake_up(&ca->set->bucket_wait); } /* @@ -556,22 +488,8 @@ int bch_cache_allocator_start(struct cache *ca) return 0; } -void bch_cache_allocator_exit(struct cache *ca) -{ - struct discard *d; - - while (!list_empty(&ca->discards)) { - d = list_first_entry(&ca->discards, struct discard, list); - cancel_work_sync(&d->work); - list_del(&d->list); - kfree(d); - } -} - int bch_cache_allocator_init(struct cache *ca) { - unsigned i; - /* * Reserve: * Prio/gen writes first @@ -589,15 +507,5 @@ int bch_cache_allocator_init(struct cache *ca) ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + ca->watermark[WATERMARK_MOVINGGC]; - for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) { - struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL); - if (!d) - return -ENOMEM; - - d->ca = ca; - INIT_WORK(&d->work, discard_finish); - list_add(&d->list, &ca->discards); - } - return 0; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 0f12382..bf37474 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -620,15 +620,6 @@ struct cache { bool discard; /* Get rid of? */ - /* - * We preallocate structs for issuing discards to buckets, and keep them - * on this list when they're not in use; do_discard() issues discards - * whenever there's work to do and is called by free_some_buckets() and - * when a discard finishes. - */ - atomic_t discards_in_flight; - struct list_head discards; - struct journal_device journal; /* The rest of this all shows up in sysfs */ @@ -1222,7 +1213,6 @@ int bch_btree_cache_alloc(struct cache_set *); void bch_moving_init_cache_set(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); -void bch_cache_allocator_exit(struct cache *ca); int bch_cache_allocator_init(struct cache *ca); void bch_debug_exit(void); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 547c4c5..fd37342 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1725,8 +1725,6 @@ void bch_cache_release(struct kobject *kobj) if (ca->set) ca->set->cache[ca->sb.nr_this_dev] = NULL; - bch_cache_allocator_exit(ca); - bio_split_pool_free(&ca->bio_split_hook); free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); @@ -1758,8 +1756,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - INIT_LIST_HEAD(&ca->discards); - bio_init(&ca->journal.bio); ca->journal.bio.bi_max_vecs = 8; ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; -- cgit v0.10.2 From 77c320eb46e216c17aee5c943949229ccfed6904 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Jul 2013 19:42:51 -0700 Subject: bcache: Add on error panic/unregister setting Works kind of like the ext4 setting, to panic or remount read only on errors. 
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index bf37474..5786156 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -843,8 +843,14 @@ struct cache_set { atomic_long_t cache_read_races; atomic_long_t writeback_keys_done; atomic_long_t writeback_keys_failed; + + enum { + ON_ERROR_UNREGISTER, + ON_ERROR_PANIC, + } on_error; unsigned error_limit; unsigned error_decay; + unsigned short journal_delay_ms; unsigned verify:1; unsigned key_merging_disabled:1; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7c9e6bf..9e87758 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -305,10 +305,9 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); - if (n != i->j.seq) - pr_err( - "journal entries %llu-%llu missing! (replaying %llu-%llu)\n", - n, i->j.seq - 1, start, end); + cache_set_err_on(n != i->j.seq, s, +"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", + n, i->j.seq - 1, start, end); for (k = i->j.start; k < end(&i->j); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fd37342..74f2e90 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1260,7 +1260,8 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) { va_list args; - if (test_bit(CACHE_SET_STOPPING, &c->flags)) + if (c->on_error != ON_ERROR_PANIC && + test_bit(CACHE_SET_STOPPING, &c->flags)) return false; /* XXX: we can be called from atomic context @@ -1275,6 +1276,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) printk(", disabling caching\n"); + if (c->on_error == ON_ERROR_PANIC) + panic("panic forced after error\n"); + bch_cache_set_unregister(c); return true; } diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 924dcfd..4211d82 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -21,6 +21,12 @@ static const char * const cache_replacement_policies[] = { NULL }; +static const char * const error_actions[] = { + "unregister", + "panic", + NULL +}; + write_attribute(attach); write_attribute(detach); write_attribute(unregister); @@ -90,6 +96,7 @@ rw_attribute(discard); rw_attribute(running); rw_attribute(label); rw_attribute(readahead); +rw_attribute(errors); rw_attribute(io_error_limit); rw_attribute(io_error_halflife); rw_attribute(verify); @@ -492,6 +499,10 @@ lock_root: sysfs_print(writeback_keys_failed, atomic_long_read(&c->writeback_keys_failed)); + if (attr == &sysfs_errors) + return bch_snprint_string_list(buf, PAGE_SIZE, error_actions, + c->on_error); + /* See count_io_errors for why 88 */ sysfs_print(io_error_halflife, c->error_decay * 88); sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); @@ -569,6 +580,15 @@ STORE(__bch_cache_set) sysfs_strtoul(congested_write_threshold_us, c->congested_write_threshold_us); + if (attr == &sysfs_errors) { + ssize_t v = bch_read_string_list(buf, error_actions); + + if (v < 0) + return v; + + c->on_error = v; + } + if (attr == &sysfs_io_error_limit) c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; @@ -620,6 +640,7 @@ static struct attribute *bch_cache_set_files[] = { &sysfs_average_key_size, &sysfs_dirty_data, + &sysfs_errors, &sysfs_io_error_limit, &sysfs_io_error_halflife, &sysfs_congested, -- cgit v0.10.2 From 2d679fc75678551485df62274edaed452becd16d Mon Sep 17 00:00:00 2001 From: Kent Overstreet 
Date: Sat, 17 Aug 2013 02:13:15 -0700 Subject: bcache: Stripe size isn't necessarily a power of two Originally I got this right... except that the divides didn't use do_div(), which broke 32 bit kernels. When I went to fix that, I forgot that the raid stripe size usually isn't a power of two... doh Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 5786156..6e836f2 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -437,7 +437,7 @@ struct bcache_device { int flush_done; uint64_t nr_stripes; - unsigned stripe_size_bits; + unsigned stripe_size; atomic_t *stripe_sectors_dirty; unsigned long sectors_dirty_last; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 74f2e90..d3169c0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -761,11 +761,10 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, struct request_queue *q; size_t n; - if (!d->stripe_size_bits) - d->stripe_size_bits = 31; + if (!d->stripe_size) + d->stripe_size = 1 << 31; - d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> - d->stripe_size_bits; + d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) return -ENOMEM; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4211d82..b3a66f1 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -157,7 +157,7 @@ SHOW(__bch_cached_dev) sysfs_hprint(dirty_data, bcache_dev_sectors_dirty(&dc->disk) << 9); - sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); + sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); var_printf(partial_stripes_expensive, "%u"); var_printf(sequential_merge, "%i"); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index ba3ee48..b842fbf 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -114,25 +114,25 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) { - uint64_t stripe; + uint64_t stripe = KEY_START(k); unsigned nr_sectors = KEY_SIZE(k); struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys); - unsigned stripe_size = 1 << dc->disk.stripe_size_bits; if (!KEY_DIRTY(k)) return false; - stripe = KEY_START(k) >> dc->disk.stripe_size_bits; - while (1) { - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != - stripe_size) - return false; + do_div(stripe, dc->disk.stripe_size); - if (nr_sectors <= stripe_size) + while (1) { + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) == + dc->disk.stripe_size) return true; - nr_sectors -= stripe_size; + if (nr_sectors <= dc->disk.stripe_size) + return false; + + nr_sectors -= dc->disk.stripe_size; stripe++; } } @@ -186,11 +186,12 @@ static void refill_dirty(struct closure *cl) for (i = 0; i < dc->disk.nr_stripes; i++) if (atomic_read(dc->disk.stripe_sectors_dirty + i) == - 1 << dc->disk.stripe_size_bits) + dc->disk.stripe_size) goto full_stripes; goto normal_refill; full_stripes: + searched_from_start = false; /* not searching entire btree */ bch_refill_keybuf(dc->disk.c, buf, &end, dirty_full_stripe_pred); } else { @@ -252,19 +253,19 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, uint64_t offset, int nr_sectors) { struct bcache_device *d = c->devices[inode]; - unsigned stripe_size, stripe_offset; - uint64_t stripe; + unsigned stripe_offset; + 
uint64_t stripe = offset; if (!d) return; - stripe_size = 1 << d->stripe_size_bits; - stripe = offset >> d->stripe_size_bits; - stripe_offset = offset & (stripe_size - 1); + do_div(stripe, d->stripe_size); + + stripe_offset = offset & (d->stripe_size - 1); while (nr_sectors) { int s = min_t(unsigned, abs(nr_sectors), - stripe_size - stripe_offset); + d->stripe_size - stripe_offset); if (nr_sectors < 0) s = -s; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index c91f61b..3496188 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -18,16 +18,18 @@ static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, uint64_t offset, unsigned nr_sectors) { - uint64_t stripe = offset >> d->stripe_size_bits; + uint64_t stripe = offset; + + do_div(stripe, d->stripe_size); while (1) { if (atomic_read(d->stripe_sectors_dirty + stripe)) return true; - if (nr_sectors <= 1 << d->stripe_size_bits) + if (nr_sectors <= d->stripe_size) return false; - nr_sectors -= 1 << d->stripe_size_bits; + nr_sectors -= d->stripe_size; stripe++; } } -- cgit v0.10.2 From 8304ad4dc818ffd701c2f3e90683b5b8013f44e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:20:00 -0700 Subject: bcache: Remove unnecessary check in should_split() Checking i->seq was redundant, because since ages ago we always initialize the new bset when advancing b->written Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 3333d37..8a1c7e6 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -353,7 +353,7 @@ static inline void rw_unlock(bool w, struct btree *b) _w == insert_lock(op, _b)) \ _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c, &(op)->cl); \ + bch_cannibalize_unlock(c, &(op)->cl); \ } while (_r == -EINTR); \ \ _r; \ @@ -363,8 +363,7 @@ static inline bool should_split(struct btree *b) { struct bset *i = write_block(b); return b->written >= btree_blocks(b) || - (i->seq == b->sets[0].data->seq && - b->written + __set_blocks(i, i->keys + 15, b->c) + (b->written + __set_blocks(i, i->keys + 15, b->c) > btree_blocks(b)); } -- cgit v0.10.2 From d6fd3b11cea82346837957feab25b0be48aa424c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:20:19 -0700 Subject: bcache: Explicitly track btree node's parent This is prep work for the reworked btree insertion code. The way we set b->parent is ugly and hacky... the problem is, when btree_split() or garbage collection splits or rewrites a btree node, the parent changes for all its (potentially already cached) children. I may change this later and add some code to look through the btree node cache and find all our cached child nodes and change the parent pointer then... 
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index d1734d9..87299ba 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -884,6 +884,7 @@ out: lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); b->level = level; + b->parent = (void *) ~0UL; mca_reinit(b); @@ -1898,7 +1899,7 @@ out: static int btree_split(struct btree *b, struct btree_op *op) { - bool split, root = b == b->c->root; + bool split; struct btree *n1, *n2 = NULL, *n3 = NULL; uint64_t start_time = local_clock(); @@ -1920,7 +1921,7 @@ static int btree_split(struct btree *b, struct btree_op *op) if (IS_ERR(n2)) goto err_free1; - if (root) { + if (!b->parent) { n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); if (IS_ERR(n3)) goto err_free2; @@ -1928,7 +1929,8 @@ static int btree_split(struct btree *b, struct btree_op *op) bch_btree_insert_keys(n1, op); - /* Has to be a linear search because we don't have an auxiliary + /* + * Has to be a linear search because we don't have an auxiliary * search tree yet */ @@ -1960,6 +1962,8 @@ static int btree_split(struct btree *b, struct btree_op *op) bch_btree_node_write(n1, &op->cl); if (n3) { + /* Depth increases, make a new root */ + bkey_copy_key(&n3->key, &MAX_KEY); bch_btree_insert_keys(n3, op); bch_btree_node_write(n3, &op->cl); @@ -1967,7 +1971,9 @@ static int btree_split(struct btree *b, struct btree_op *op) closure_sync(&op->cl); bch_btree_set_root(n3); rw_unlock(true, n3); - } else if (root) { + } else if (!b->parent) { + /* Root filled up but didn't need to be split */ + op->keys.top = op->keys.bottom; closure_sync(&op->cl); bch_btree_set_root(n1); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 8a1c7e6..6d2fb75 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -125,6 +125,7 @@ struct btree { unsigned long seq; struct rw_semaphore lock; struct cache_set *c; + struct btree *parent; unsigned long flags; uint16_t written; /* would be nice to kill */ @@ -327,12 +328,13 @@ static inline void rw_unlock(bool w, struct btree *b) ({ \ int _r, l = (b)->level - 1; \ bool _w = l <= (op)->lock; \ - struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \ - if (!IS_ERR(_b)) { \ - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ - rw_unlock(_w, _b); \ + struct btree *_child = bch_btree_node_get((b)->c, key, l, op); \ + if (!IS_ERR(_child)) { \ + _child->parent = (b); \ + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ + rw_unlock(_w, _child); \ } else \ - _r = PTR_ERR(_b); \ + _r = PTR_ERR(_child); \ _r; \ }) @@ -350,8 +352,10 @@ static inline void rw_unlock(bool w, struct btree *b) bool _w = insert_lock(op, _b); \ rw_lock(_w, _b, _b->level); \ if (_b == (c)->root && \ - _w == insert_lock(op, _b)) \ + _w == insert_lock(op, _b)) { \ + _b->parent = NULL; \ _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + } \ rw_unlock(_w, _b); \ bch_cannibalize_unlock(c, &(op)->cl); \ } while (_r == -EINTR); \ -- cgit v0.10.2 From 26c949f8062cb9221a28b2228104f1cc5b265097 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 18:41:15 -0700 Subject: bcache: Add btree_insert_node() The flow of control in the old btree insertion code was rather - backwards; we'd recurse down the btree (in btree_insert_recurse()), and then if we needed to split the keys to be inserted into the parent node would be effectively returned up to btree_insert_recurse(), which would notice there was more work to do and finish the insertion. 
The main problem with this was that the full logic for btree insertion could only be used by calling btree_insert_recurse; if you'd gotten to a btree leaf some other way and had a key to insert, if it turned out that node needed to be split you were SOL. This inverts the flow of control so btree_insert_node() does _full_ btree insertion, including splitting - and takes a (leaf) btree node to insert into as a parameter. This means we can now _correctly_ handle cache misses - for cache misses, we need to insert a fake "check" key into the btree when we discover we have a cache miss - while we still have the btree locked. Previously, if the btree node was full inserting a cache miss would just fail. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 22d1ae7..830eede 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -73,6 +73,18 @@ struct bkey *bch_keylist_pop(struct keylist *l) return l->top = k; } +void bch_keylist_pop_front(struct keylist *l) +{ + struct bkey *next = bkey_next(l->bottom); + size_t bytes = ((void *) l->top) - ((void *) next); + + memmove(l->bottom, + next, + bytes); + + l->top = ((void *) l->bottom) + bytes; +} + /* Pointer validation */ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index ae115a2..a3627d0 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -267,6 +267,7 @@ static inline void bch_keylist_free(struct keylist *l) void bch_keylist_copy(struct keylist *, struct keylist *); struct bkey *bch_keylist_pop(struct keylist *); +void bch_keylist_pop_front(struct keylist *); int bch_keylist_realloc(struct keylist *, int, struct cache_set *); void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 87299ba..c2722e0 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1849,15 +1849,43 @@ merged: return true; } -static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) +static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, + struct keylist *insert_keys) { bool ret = false; - struct bkey *k; unsigned oldsize = bch_count_data(b); - while ((k = bch_keylist_pop(&op->keys))) { - bkey_put(b->c, k, b->level); - ret |= btree_insert_key(b, op, k); + BUG_ON(!insert_lock(op, b)); + + while (!bch_keylist_empty(insert_keys)) { + struct bkey *k = insert_keys->bottom; + + if (b->level || + bkey_cmp(k, &b->key) <= 0) { + bkey_put(b->c, k, b->level); + + ret |= btree_insert_key(b, op, k); + bch_keylist_pop_front(insert_keys); + } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { +#if 0 + if (op->type == BTREE_REPLACE) { + bkey_put(b->c, k, b->level); + bch_keylist_pop_front(insert_keys); + op->insert_collision = true; + break; + } +#endif + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, insert_keys->bottom); + + bch_cut_back(&b->key, &temp.key); + bch_cut_front(&b->key, insert_keys->bottom); + + ret |= btree_insert_key(b, op, &temp.key); + break; + } else { + break; + } } BUG_ON(bch_count_data(b) < oldsize); @@ -1897,7 +1925,9 @@ out: return ret; } -static int btree_split(struct btree *b, struct btree_op *op) +static int btree_split(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + struct keylist *parent_keys) { bool split; struct btree *n1, *n2 = NULL, *n3 = NULL; @@ -1927,7 +1957,7 @@ static int btree_split(struct btree *b, struct btree_op *op) goto 
err_free2; } - bch_btree_insert_keys(n1, op); + bch_btree_insert_keys(n1, op, insert_keys); /* * Has to be a linear search because we don't have an auxiliary @@ -1949,23 +1979,23 @@ static int btree_split(struct btree *b, struct btree_op *op) bkey_copy_key(&n2->key, &b->key); - bch_keylist_add(&op->keys, &n2->key); + bch_keylist_add(parent_keys, &n2->key); bch_btree_node_write(n2, &op->cl); rw_unlock(true, n2); } else { trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); - bch_btree_insert_keys(n1, op); + bch_btree_insert_keys(n1, op, insert_keys); } - bch_keylist_add(&op->keys, &n1->key); + bch_keylist_add(parent_keys, &n1->key); bch_btree_node_write(n1, &op->cl); if (n3) { /* Depth increases, make a new root */ bkey_copy_key(&n3->key, &MAX_KEY); - bch_btree_insert_keys(n3, op); + bch_btree_insert_keys(n3, op, parent_keys); bch_btree_node_write(n3, &op->cl); closure_sync(&op->cl); @@ -1974,22 +2004,22 @@ static int btree_split(struct btree *b, struct btree_op *op) } else if (!b->parent) { /* Root filled up but didn't need to be split */ - op->keys.top = op->keys.bottom; + parent_keys->top = parent_keys->bottom; closure_sync(&op->cl); bch_btree_set_root(n1); } else { unsigned i; - bkey_copy(op->keys.top, &b->key); - bkey_copy_key(op->keys.top, &ZERO_KEY); + bkey_copy(parent_keys->top, &b->key); + bkey_copy_key(parent_keys->top, &ZERO_KEY); for (i = 0; i < KEY_PTRS(&b->key); i++) { uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; - SET_PTR_GEN(op->keys.top, i, g); + SET_PTR_GEN(parent_keys->top, i, g); } - bch_keylist_push(&op->keys); + bch_keylist_push(parent_keys); closure_sync(&op->cl); atomic_inc(&b->c->prio_blocked); } @@ -2018,69 +2048,65 @@ err: return -ENOMEM; } -static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, - struct keylist *stack_keys) +static int bch_btree_insert_node(struct btree *b, struct btree_op *op, + struct keylist *insert_keys) { - if (b->level) { - int ret; - struct bkey *insert = op->keys.bottom; - struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); + int ret = 0; + struct keylist split_keys; - if (!k) { - btree_bug(b, "no key to recurse on at level %i/%i", - b->level, b->c->root->level); + bch_keylist_init(&split_keys); - op->keys.top = op->keys.bottom; - return -EIO; - } + BUG_ON(b->level); - if (bkey_cmp(insert, k) > 0) { - unsigned i; + do { + if (should_split(b)) { + if (current->bio_list) { + op->lock = b->c->root->level + 1; + ret = -EAGAIN; + } else if (op->lock <= b->c->root->level) { + op->lock = b->c->root->level + 1; + ret = -EINTR; + } else { + struct btree *parent = b->parent; - if (op->type == BTREE_REPLACE) { - __bkey_put(b->c, insert); - op->keys.top = op->keys.bottom; - op->insert_collision = true; - return 0; + ret = btree_split(b, op, insert_keys, + &split_keys); + insert_keys = &split_keys; + b = parent; } + } else { + BUG_ON(write_block(b) != b->sets[b->nsets].data); - for (i = 0; i < KEY_PTRS(insert); i++) - atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); - - bkey_copy(stack_keys->top, insert); - - bch_cut_back(k, insert); - bch_cut_front(k, stack_keys->top); - - bch_keylist_push(stack_keys); + if (bch_btree_insert_keys(b, op, insert_keys)) { + if (!b->level) + bch_btree_leaf_dirty(b, op); + else + bch_btree_node_write(b, &op->cl); + } } + } while (!bch_keylist_empty(&split_keys)); - ret = btree(insert_recurse, k, b, op, stack_keys); - if (ret) - return ret; - } + return ret; +} - if (!bch_keylist_empty(&op->keys)) { - if (should_split(b)) { - if (op->lock <= b->c->root->level) { - BUG_ON(b->level); - 
op->lock = b->c->root->level + 1; - return -EINTR; - } - return btree_split(b, op); - } +static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op) +{ + if (b->level) { + struct bkey *insert = op->keys.bottom; + struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); - BUG_ON(write_block(b) != b->sets[b->nsets].data); + if (!k) { + btree_bug(b, "no key to recurse on at level %i/%i", + b->level, b->c->root->level); - if (bch_btree_insert_keys(b, op)) { - if (!b->level) - bch_btree_leaf_dirty(b, op); - else - bch_btree_node_write(b, &op->cl); + op->keys.top = op->keys.bottom; + return -EIO; } - } - return 0; + return btree(insert_recurse, k, b, op); + } else { + return bch_btree_insert_node(b, op, &op->keys); + } } int bch_btree_insert(struct btree_op *op, struct cache_set *c) @@ -2106,7 +2132,7 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c) op->lock = 0; } - ret = btree_root(insert_recurse, c, op, &stack_keys); + ret = btree_root(insert_recurse, c, op); if (ret == -EAGAIN) { ret = 0; -- cgit v0.10.2 From 403b6cdeb1a38d896ffcb1f99ddcfd4e343b5d69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:22:44 -0700 Subject: bcache: Insert multiple keys at a time We'll often end up with a list of adjacent keys to insert - because bch_data_insert() may have to fragment the data it writes. Originally, to simplify things and avoid having to deal with corner cases bch_btree_insert() would pass keys from this list one at a time to btree_insert_recurse() - mainly because the list of keys might span leaf nodes, so it was easier this way. With the btree_insert_node() refactoring, it's now a lot easier to just pass down the whole list and have btree_insert_recurse() iterate over leaf nodes until it's done. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index c2722e0..60d0646 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1858,10 +1858,14 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, BUG_ON(!insert_lock(op, b)); while (!bch_keylist_empty(insert_keys)) { + struct bset *i = write_block(b); struct bkey *k = insert_keys->bottom; - if (b->level || - bkey_cmp(k, &b->key) <= 0) { + if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) + > btree_blocks(b)) + break; + + if (bkey_cmp(k, &b->key) <= 0) { bkey_put(b->c, k, b->level); ret |= btree_insert_key(b, op, k); @@ -1888,6 +1892,8 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, } } + BUG_ON(!bch_keylist_empty(insert_keys) && b->level); + BUG_ON(bch_count_data(b) < oldsize); return ret; } @@ -2073,6 +2079,8 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, &split_keys); insert_keys = &split_keys; b = parent; + if (!ret) + ret = -EINTR; } } else { BUG_ON(write_block(b) != b->sets[b->nsets].data); @@ -2091,6 +2099,9 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op) { + if (bch_keylist_empty(&op->keys)) + return 0; + if (b->level) { struct bkey *insert = op->keys.bottom; struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); @@ -2112,7 +2123,6 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op) int bch_btree_insert(struct btree_op *op, struct cache_set *c) { int ret = 0; - struct keylist stack_keys; /* * Don't want to block with the btree locked unless we have to, @@ -2121,17 +2131,9 @@ int bch_btree_insert(struct 
btree_op *op, struct cache_set *c) clear_closure_blocking(&op->cl); BUG_ON(bch_keylist_empty(&op->keys)); - bch_keylist_copy(&stack_keys, &op->keys); - bch_keylist_init(&op->keys); - - while (!bch_keylist_empty(&stack_keys) || - !bch_keylist_empty(&op->keys)) { - if (bch_keylist_empty(&op->keys)) { - bch_keylist_add(&op->keys, - bch_keylist_pop(&stack_keys)); - op->lock = 0; - } + while (!bch_keylist_empty(&op->keys)) { + op->lock = 0; ret = btree_root(insert_recurse, c, op); if (ret == -EAGAIN) { @@ -2143,14 +2145,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c) pr_err("error %i trying to insert key for %s", ret, op_type(op)); - while ((k = bch_keylist_pop(&stack_keys) ?: - bch_keylist_pop(&op->keys))) + while ((k = bch_keylist_pop(&op->keys))) bkey_put(c, k, 0); } } - bch_keylist_free(&stack_keys); - if (op->journal) atomic_dec_bug(op->journal); op->journal = NULL; -- cgit v0.10.2 From e7c590eb63509c5d5f48a390d23aa25f4417ac96 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 18:39:16 -0700 Subject: bcache: Convert btree_insert_check_key() to btree_insert_node() This was the main point of all this refactoring - now, btree_insert_check_key() won't fail just because the leaf node happened to be full. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6e836f2..10ce0c8 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1091,14 +1091,6 @@ do { \ for (b = (ca)->buckets + (ca)->sb.first_bucket; \ b < (ca)->buckets + (ca)->sb.nbuckets; b++) -static inline void __bkey_put(struct cache_set *c, struct bkey *k) -{ - unsigned i; - - for (i = 0; i < KEY_PTRS(k); i++) - atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); -} - static inline void cached_dev_put(struct cached_dev *dc) { if (atomic_dec_and_test(&dc->count)) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 60d0646..08a8532 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -118,6 +118,15 @@ void bch_btree_op_init_stack(struct btree_op *op) /* Btree key manipulation */ +void __bkey_put(struct cache_set *c, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) + atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); +} + static void bkey_put(struct cache_set *c, struct bkey *k, int level) { if ((level && KEY_OFFSET(k)) || !level) @@ -1855,8 +1864,6 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, bool ret = false; unsigned oldsize = bch_count_data(b); - BUG_ON(!insert_lock(op, b)); - while (!bch_keylist_empty(insert_keys)) { struct bset *i = write_block(b); struct bkey *k = insert_keys->bottom; @@ -1898,39 +1905,6 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, return ret; } -bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, - struct bio *bio) -{ - bool ret = false; - uint64_t btree_ptr = b->key.ptr[0]; - unsigned long seq = b->seq; - BKEY_PADDED(k) tmp; - - rw_unlock(false, b); - rw_lock(true, b, b->level); - - if (b->key.ptr[0] != btree_ptr || - b->seq != seq + 1 || - should_split(b)) - goto out; - - op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); - - SET_KEY_PTRS(&op->replace, 1); - get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); - - SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); - - bkey_copy(&tmp.k, &op->replace); - - BUG_ON(op->type != BTREE_INSERT); - BUG_ON(!btree_insert_key(b, op, &tmp.k)); - ret = true; -out: - downgrade_write(&b->lock); - 
return ret; -} - static int btree_split(struct btree *b, struct btree_op *op, struct keylist *insert_keys, struct keylist *parent_keys) @@ -2097,6 +2071,44 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, return ret; } +int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, + struct bkey *check_key) +{ + int ret = -EINTR; + uint64_t btree_ptr = b->key.ptr[0]; + unsigned long seq = b->seq; + struct keylist insert; + bool upgrade = op->lock == -1; + + bch_keylist_init(&insert); + + if (upgrade) { + rw_unlock(false, b); + rw_lock(true, b, b->level); + + if (b->key.ptr[0] != btree_ptr || + b->seq != seq + 1) + goto out; + } + + SET_KEY_PTRS(check_key, 1); + get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); + + SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); + + bch_keylist_add(&insert, check_key); + + BUG_ON(op->type != BTREE_INSERT); + + ret = bch_btree_insert_node(b, op, &insert); + + BUG_ON(!ret && !bch_keylist_empty(&insert)); +out: + if (upgrade) + downgrade_write(&b->lock); + return ret; +} + static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op) { if (bch_keylist_empty(&op->keys)) diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 6d2fb75..73bd621 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -216,6 +216,8 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b, return __bch_btree_iter_init(b, iter, search, b->sets); } +void __bkey_put(struct cache_set *c, struct bkey *k); + /* Looping macros */ #define for_each_cached_btree(b, c, iter) \ @@ -380,8 +382,8 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, struct btree_op *); -bool bch_btree_insert_check_key(struct btree *, struct btree_op *, - struct bio *); +int bch_btree_insert_check_key(struct btree *, struct btree_op *, + struct bkey *); int bch_btree_insert(struct btree_op *, struct cache_set *); int bch_btree_search_recurse(struct btree *, struct btree_op *); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 2a7f0dd..9ed334c 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -869,35 +869,39 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { int ret = 0; - unsigned reada; + unsigned reada = 0; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss; - miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (miss == bio) - s->op.lookup_done = true; + if (s->cache_miss || s->op.skip) { + miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + if (miss == bio) + s->op.lookup_done = true; + goto out_submit; + } - miss->bi_end_io = request_endio; - miss->bi_private = &s->cl; + if (!(bio->bi_rw & REQ_RAHEAD) && + !(bio->bi_rw & REQ_META) && + s->op.c->gc_stats.in_use < CUTOFF_CACHE_READA) + reada = min_t(sector_t, dc->readahead >> 9, + bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); - if (s->cache_miss || s->op.skip) - goto out_submit; + s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada); - if (miss != bio || - (bio->bi_rw & REQ_RAHEAD) || - (bio->bi_rw & REQ_META) || - s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) - reada = 0; - else { - reada = min(dc->readahead >> 9, - sectors - bio_sectors(miss)); - - if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev)) - reada = bdev_sectors(miss->bi_bdev) - - bio_end_sector(miss); - } + 
s->op.replace = KEY(s->op.inode, bio->bi_sector + + s->cache_bio_sectors, s->cache_bio_sectors); + + ret = bch_btree_insert_check_key(b, &s->op, &s->op.replace); + if (ret) + return ret; + + miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + if (miss == bio) + s->op.lookup_done = true; + else + /* btree_search_recurse()'s btree iterator is no good anymore */ + ret = -EINTR; - s->cache_bio_sectors = bio_sectors(miss) + reada; s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), dc->disk.bio_split); @@ -912,11 +916,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->op.cache_bio->bi_end_io = request_endio; s->op.cache_bio->bi_private = &s->cl; - /* btree_search_recurse()'s btree iterator is no good anymore */ - ret = -EINTR; - if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) - goto out_put; - bch_bio_map(s->op.cache_bio, NULL); if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; @@ -931,6 +930,8 @@ out_put: bio_put(s->op.cache_bio); s->op.cache_bio = NULL; out_submit: + miss->bi_end_io = request_endio; + miss->bi_private = &s->cl; closure_bio_submit(miss, &s->cl, s->d); return ret; } -- cgit v0.10.2 From 4f3d40147b8d0ce7055e241e1d263e0aa2b2b46d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 18:46:36 -0700 Subject: bcache: Add explicit keylist arg to btree_insert() Some refactoring - better to explicitly pass stuff around instead of having it all in the "big bag of state", struct btree_op. Going to prune struct btree_op quite a bit over time. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 08a8532..fc3cae5 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2109,30 +2109,32 @@ out: return ret; } -static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op) +static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, + struct keylist *keys) { - if (bch_keylist_empty(&op->keys)) + if (bch_keylist_empty(keys)) return 0; if (b->level) { - struct bkey *insert = op->keys.bottom; - struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); + struct bkey *k; + k = bch_next_recurse_key(b, &START_KEY(keys->bottom)); if (!k) { btree_bug(b, "no key to recurse on at level %i/%i", b->level, b->c->root->level); - op->keys.top = op->keys.bottom; + keys->top = keys->bottom; return -EIO; } - return btree(insert_recurse, k, b, op); + return btree(insert_recurse, k, b, op, keys); } else { - return bch_btree_insert_node(b, op, &op->keys); + return bch_btree_insert_node(b, op, keys); } } -int bch_btree_insert(struct btree_op *op, struct cache_set *c) +int bch_btree_insert(struct btree_op *op, struct cache_set *c, + struct keylist *keys) { int ret = 0; @@ -2142,11 +2144,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c) */ clear_closure_blocking(&op->cl); - BUG_ON(bch_keylist_empty(&op->keys)); + BUG_ON(bch_keylist_empty(keys)); - while (!bch_keylist_empty(&op->keys)) { + while (!bch_keylist_empty(keys)) { op->lock = 0; - ret = btree_root(insert_recurse, c, op); + ret = btree_root(insert_recurse, c, op, keys); if (ret == -EAGAIN) { ret = 0; @@ -2157,7 +2159,7 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c) pr_err("error %i trying to insert key for %s", ret, op_type(op)); - while ((k = bch_keylist_pop(&op->keys))) + while ((k = bch_keylist_pop(keys))) bkey_put(c, k, 0); } } diff --git a/drivers/md/bcache/btree.h 
b/drivers/md/bcache/btree.h index 73bd621..967aacd 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -384,7 +384,7 @@ struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); -int bch_btree_insert(struct btree_op *, struct cache_set *); +int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *); int bch_btree_search_recurse(struct btree *, struct btree_op *); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 9e87758..5abe5d5 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -320,7 +320,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, op->journal = i->pin; atomic_inc(op->journal); - ret = bch_btree_insert(op, s); + ret = bch_btree_insert(op, s, &op->keys); if (ret) goto err; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 9ed334c..7fd84ce9 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -607,7 +607,7 @@ void bch_btree_insert_async(struct closure *cl) struct btree_op *op = container_of(cl, struct btree_op, cl); struct search *s = container_of(op, struct search, op); - if (bch_btree_insert(op, op->c)) { + if (bch_btree_insert(op, op->c, &op->keys)) { s->error = -ENOMEM; op->insert_data_done = true; } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index b842fbf..8ffc8ec 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -311,7 +311,7 @@ static void write_dirty_finish(struct closure *cl) for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - bch_btree_insert(&op, dc->disk.c); + bch_btree_insert(&op, dc->disk.c, &op.keys); closure_sync(&op.cl); if (op.insert_collision) -- cgit v0.10.2 From c2f95ae2ebbe1ab61b1d4437f5923fdf720d4d4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:24:25 -0700 Subject: bcache: Clean up keylist code More random refactoring. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 830eede..d0512e4 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -14,22 +14,12 @@ /* Keylists */ -void bch_keylist_copy(struct keylist *dest, struct keylist *src) -{ - *dest = *src; - - if (src->list == src->d) { - size_t n = (uint64_t *) src->top - src->d; - dest->top = (struct bkey *) &dest->d[n]; - dest->list = dest->d; - } -} - int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) { - unsigned oldsize = (uint64_t *) l->top - l->list; - unsigned newsize = oldsize + 2 + nptrs; - uint64_t *new; + size_t oldsize = bch_keylist_nkeys(l); + size_t newsize = oldsize + 2 + nptrs; + uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; + uint64_t *new_keys; /* The journalling code doesn't handle the case where the keys to insert * is bigger than an empty write: If we just return -ENOMEM here, @@ -45,24 +35,23 @@ int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) roundup_pow_of_two(oldsize) == newsize) return 0; - new = krealloc(l->list == l->d ? 
NULL : l->list, - sizeof(uint64_t) * newsize, GFP_NOIO); + new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO); - if (!new) + if (!new_keys) return -ENOMEM; - if (l->list == l->d) - memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE); + if (!old_keys) + memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize); - l->list = new; - l->top = (struct bkey *) (&l->list[oldsize]); + l->keys_p = new_keys; + l->top_p = new_keys + oldsize; return 0; } struct bkey *bch_keylist_pop(struct keylist *l) { - struct bkey *k = l->bottom; + struct bkey *k = l->keys; if (k == l->top) return NULL; @@ -75,14 +64,11 @@ struct bkey *bch_keylist_pop(struct keylist *l) void bch_keylist_pop_front(struct keylist *l) { - struct bkey *next = bkey_next(l->bottom); - size_t bytes = ((void *) l->top) - ((void *) next); - - memmove(l->bottom, - next, - bytes); + l->top_p -= bkey_u64s(l->keys); - l->top = ((void *) l->bottom) + bytes; + memmove(l->keys, + bkey_next(l->keys), + bch_keylist_bytes(l)); } /* Pointer validation */ diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index a3627d0..8a93056 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -227,20 +227,23 @@ static inline struct bkey *bkey_next(const struct bkey *k) /* Keylists */ struct keylist { - struct bkey *top; union { - uint64_t *list; - struct bkey *bottom; + struct bkey *keys; + uint64_t *keys_p; + }; + union { + struct bkey *top; + uint64_t *top_p; }; /* Enough room for btree_split's keys without realloc */ #define KEYLIST_INLINE 16 - uint64_t d[KEYLIST_INLINE]; + uint64_t inline_keys[KEYLIST_INLINE]; }; static inline void bch_keylist_init(struct keylist *l) { - l->top = (void *) (l->list = l->d); + l->top_p = l->keys_p = l->inline_keys; } static inline void bch_keylist_push(struct keylist *l) @@ -256,16 +259,30 @@ static inline void bch_keylist_add(struct keylist *l, struct bkey *k) static inline bool bch_keylist_empty(struct keylist *l) { - return l->top == (void *) l->list; + return l->top == l->keys; +} + +static inline void bch_keylist_reset(struct keylist *l) +{ + l->top = l->keys; } static inline void bch_keylist_free(struct keylist *l) { - if (l->list != l->d) - kfree(l->list); + if (l->keys_p != l->inline_keys) + kfree(l->keys_p); +} + +static inline size_t bch_keylist_nkeys(struct keylist *l) +{ + return l->top_p - l->keys_p; +} + +static inline size_t bch_keylist_bytes(struct keylist *l) +{ + return bch_keylist_nkeys(l) * sizeof(uint64_t); } -void bch_keylist_copy(struct keylist *, struct keylist *); struct bkey *bch_keylist_pop(struct keylist *); void bch_keylist_pop_front(struct keylist *); int bch_keylist_realloc(struct keylist *, int, struct cache_set *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fc3cae5..f960607 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1866,7 +1866,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, while (!bch_keylist_empty(insert_keys)) { struct bset *i = write_block(b); - struct bkey *k = insert_keys->bottom; + struct bkey *k = insert_keys->keys; if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) > btree_blocks(b)) @@ -1887,10 +1887,10 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, } #endif BKEY_PADDED(key) temp; - bkey_copy(&temp.key, insert_keys->bottom); + bkey_copy(&temp.key, insert_keys->keys); bch_cut_back(&b->key, &temp.key); - bch_cut_front(&b->key, insert_keys->bottom); + bch_cut_front(&b->key, insert_keys->keys); ret |= 
btree_insert_key(b, op, &temp.key); break; @@ -1984,7 +1984,7 @@ static int btree_split(struct btree *b, struct btree_op *op, } else if (!b->parent) { /* Root filled up but didn't need to be split */ - parent_keys->top = parent_keys->bottom; + bch_keylist_reset(parent_keys); closure_sync(&op->cl); bch_btree_set_root(n1); } else { @@ -2118,12 +2118,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, if (b->level) { struct bkey *k; - k = bch_next_recurse_key(b, &START_KEY(keys->bottom)); + k = bch_next_recurse_key(b, &START_KEY(keys->keys)); if (!k) { btree_bug(b, "no key to recurse on at level %i/%i", b->level, b->c->root->level); - keys->top = keys->bottom; + bch_keylist_reset(keys); return -EIO; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 5abe5d5..1bdefdb 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -713,7 +713,7 @@ void bch_journal(struct closure *cl) struct btree_op *op = container_of(cl, struct btree_op, cl); struct cache_set *c = op->c; struct journal_write *w; - size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list; + size_t sectors, nkeys; if (op->type != BTREE_INSERT || !CACHE_SYNC(&c->sb)) @@ -741,10 +741,12 @@ void bch_journal(struct closure *cl) } w = c->journal.cur; - b = __set_blocks(w->data, w->data->keys + n, c); + nkeys = w->data->keys + bch_keylist_nkeys(&op->keys); + sectors = __set_blocks(w->data, nkeys, c) * c->sb.block_size; - if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || - b > c->journal.blocks_free) { + if (sectors > min_t(size_t, + c->journal.blocks_free * c->sb.block_size, + PAGE_SECTORS << JSET_BITS)) { trace_bcache_journal_entry_full(c); /* @@ -760,8 +762,8 @@ void bch_journal(struct closure *cl) continue_at(cl, bch_journal, bcache_wq); } - memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t)); - w->data->keys += n; + memcpy(end(w->data), op->keys.keys, bch_keylist_bytes(&op->keys)); + w->data->keys += bch_keylist_nkeys(&op->keys); op->journal = &fifo_back(&c->journal.pin); atomic_inc(op->journal); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 7fd84ce9..a000e91 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -438,13 +438,13 @@ static void bch_insert_data_error(struct closure *cl) * from the keys we'll accomplish just that. */ - struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; + struct bkey *src = op->keys.keys, *dst = op->keys.keys; while (src != op->keys.top) { struct bkey *n = bkey_next(src); SET_KEY_PTRS(src, 0); - bkey_copy(dst, src); + memmove(dst, src, bkey_bytes(src)); dst = bkey_next(dst); src = n; -- cgit v0.10.2 From 84f0db03ea1e024f2a9e6cfcf7ac0323e4f84d3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:24:52 -0700 Subject: bcache: Refactor request_write() Try to improve some of the naming a bit to be more consistent, and also improve the flow of control in request_write() a bit. 
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 967aacd..ea0814b 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -260,7 +260,7 @@ struct btree_op { } type:8; unsigned csum:1; - unsigned skip:1; + unsigned bypass:1; unsigned flush_journal:1; unsigned insert_data_done:1; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index a000e91..dbc2ef6 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -25,8 +25,6 @@ struct kmem_cache *bch_search_cache; -static void check_should_skip(struct cached_dev *, struct search *); - /* Cgroup interface */ #ifdef CONFIG_CGROUP_BCACHE @@ -480,7 +478,7 @@ static void bch_insert_data_loop(struct closure *cl) struct search *s = container_of(op, struct search, op); struct bio *bio = op->cache_bio, *n; - if (op->skip) + if (op->bypass) return bio_invalidate(cl); if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { @@ -557,7 +555,7 @@ err: * we wait for buckets to be freed up, so just invalidate the * rest of the write. */ - op->skip = true; + op->bypass = true; return bio_invalidate(cl); } else { /* @@ -590,8 +588,8 @@ err: * It inserts the data in op->cache_bio; bi_sector is used for the key offset, * and op->inode is used for the key inode. * - * If op->skip is true, instead of inserting the data it invalidates the region - * of the cache represented by op->cache_bio and op->inode. + * If op->bypass is true, instead of inserting the data it invalidates the + * region of the cache represented by op->cache_bio and op->inode. */ void bch_insert_data(struct closure *cl) { @@ -717,7 +715,6 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; - s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; s->recoverable = 1; s->start_time = jiffies; do_bio_hook(s); @@ -757,6 +754,134 @@ static void cached_dev_bio_complete(struct closure *cl) cached_dev_put(dc); } +unsigned bch_get_congested(struct cache_set *c) +{ + int i; + long rand; + + if (!c->congested_read_threshold_us && + !c->congested_write_threshold_us) + return 0; + + i = (local_clock_us() - c->congested_last_us) / 1024; + if (i < 0) + return 0; + + i += atomic_read(&c->congested); + if (i >= 0) + return 0; + + i += CONGESTED_MAX; + + if (i > 0) + i = fract_exp_two(i, 6); + + rand = get_random_int(); + i -= bitmap_weight(&rand, BITS_PER_LONG); + + return i > 0 ? 
i : 1; +} + +static void add_sequential(struct task_struct *t) +{ + ewma_add(t->sequential_io_avg, + t->sequential_io, 8, 0); + + t->sequential_io = 0; +} + +static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) +{ + return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; +} + +static bool check_should_bypass(struct cached_dev *dc, struct search *s) +{ + struct cache_set *c = s->op.c; + struct bio *bio = &s->bio.bio; + unsigned mode = cache_mode(dc, bio); + unsigned sectors, congested = bch_get_congested(c); + + if (atomic_read(&dc->disk.detaching) || + c->gc_stats.in_use > CUTOFF_CACHE_ADD || + (bio->bi_rw & REQ_DISCARD)) + goto skip; + + if (mode == CACHE_MODE_NONE || + (mode == CACHE_MODE_WRITEAROUND && + (bio->bi_rw & REQ_WRITE))) + goto skip; + + if (bio->bi_sector & (c->sb.block_size - 1) || + bio_sectors(bio) & (c->sb.block_size - 1)) { + pr_debug("skipping unaligned io"); + goto skip; + } + + if (!congested && !dc->sequential_cutoff) + goto rescale; + + if (!congested && + mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; + + if (dc->sequential_merge) { + struct io *i; + + spin_lock(&dc->io_lock); + + hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) + if (i->last == bio->bi_sector && + time_before(jiffies, i->jiffies)) + goto found; + + i = list_first_entry(&dc->io_lru, struct io, lru); + + add_sequential(s->task); + i->sequential = 0; +found: + if (i->sequential + bio->bi_size > i->sequential) + i->sequential += bio->bi_size; + + i->last = bio_end_sector(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + s->task->sequential_io = i->sequential; + + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); + + spin_unlock(&dc->io_lock); + } else { + s->task->sequential_io = bio->bi_size; + + add_sequential(s->task); + } + + sectors = max(s->task->sequential_io, + s->task->sequential_io_avg) >> 9; + + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(s->orig_bio); + goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(s->orig_bio); + goto skip; + } + +rescale: + bch_rescale_priorities(c, bio_sectors(bio)); + return false; +skip: + bch_mark_sectors_bypassed(s, bio_sectors(bio)); + return true; +} + /* Process reads */ static void cached_dev_read_complete(struct closure *cl) @@ -854,8 +979,8 @@ static void request_read_done_bh(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); - trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); + bch_mark_cache_accounting(s, !s->cache_miss, s->op.bypass); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.bypass); if (s->error) continue_at_nobarrier(cl, request_read_error, bcache_wq); @@ -873,7 +998,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss; - if (s->cache_miss || s->op.skip) { + if (s->cache_miss || s->op.bypass) { miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); if (miss == bio) s->op.lookup_done = true; @@ -940,9 +1065,7 @@ static void request_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; - check_should_skip(dc, s); closure_call(&s->op.cl, btree_read_async, NULL, cl); - continue_at(cl, request_read_done_bh, 
NULL); } @@ -961,41 +1084,48 @@ static void request_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; - struct bkey start, end; - start = KEY(dc->disk.id, bio->bi_sector, 0); - end = KEY(dc->disk.id, bio_end_sector(bio), 0); + struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); + struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); - check_should_skip(dc, s); down_read_non_owner(&dc->writeback_lock); - if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { - s->op.skip = false; + /* + * We overlap with some dirty data undergoing background + * writeback, force this write to writeback + */ + s->op.bypass = false; s->writeback = true; } + /* + * Discards aren't _required_ to do anything, so skipping if + * check_overlapping returned true is ok + * + * But check_overlapping drops dirty keys for which io hasn't started, + * so we still want to call it. + */ if (bio->bi_rw & REQ_DISCARD) - goto skip; + s->op.bypass = true; if (should_writeback(dc, s->orig_bio, cache_mode(dc, bio), - s->op.skip)) { - s->op.skip = false; + s->op.bypass)) { + s->op.bypass = false; s->writeback = true; } - if (s->op.skip) - goto skip; - - trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); + trace_bcache_write(s->orig_bio, s->writeback, s->op.bypass); - if (!s->writeback) { - s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, - dc->disk.bio_split); + if (s->op.bypass) { + s->op.cache_bio = s->orig_bio; + bio_get(s->op.cache_bio); - closure_bio_submit(bio, cl, s->d); - } else { + if (!(bio->bi_rw & REQ_DISCARD) || + blk_queue_discard(bdev_get_queue(dc->bdev))) + closure_bio_submit(bio, cl, s->d); + } else if (s->writeback) { bch_writeback_add(dc); s->op.cache_bio = bio; @@ -1011,21 +1141,15 @@ static void request_write(struct cached_dev *dc, struct search *s) closure_bio_submit(flush, cl, s->d); } + } else { + s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, + dc->disk.bio_split); + + closure_bio_submit(bio, cl, s->d); } -out: + closure_call(&s->op.cl, bch_insert_data, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); -skip: - s->op.skip = true; - s->op.cache_bio = s->orig_bio; - bio_get(s->op.cache_bio); - - if ((bio->bi_rw & REQ_DISCARD) && - !blk_queue_discard(bdev_get_queue(dc->bdev))) - goto out; - - closure_bio_submit(bio, cl, s->d); - goto out; } static void request_nodata(struct cached_dev *dc, struct search *s) @@ -1033,14 +1157,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s) struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; - if (bio->bi_rw & REQ_DISCARD) { - request_write(dc, s); - return; - } - if (s->op.flush_journal) bch_journal_meta(s->op.c, cl); + /* If it's a flush, we send the flush to the backing device too */ closure_bio_submit(bio, cl, s->d); continue_at(cl, cached_dev_bio_complete, NULL); @@ -1048,134 +1168,6 @@ static void request_nodata(struct cached_dev *dc, struct search *s) /* Cached devices - read & write stuff */ -unsigned bch_get_congested(struct cache_set *c) -{ - int i; - long rand; - - if (!c->congested_read_threshold_us && - !c->congested_write_threshold_us) - return 0; - - i = (local_clock_us() - c->congested_last_us) / 1024; - if (i < 0) - return 0; - - i += atomic_read(&c->congested); - if (i >= 0) - return 0; - - i += CONGESTED_MAX; - - if (i > 0) - i = fract_exp_two(i, 6); - - rand = get_random_int(); - i -= bitmap_weight(&rand, BITS_PER_LONG); - - return i > 0 ? 
i : 1; -} - -static void add_sequential(struct task_struct *t) -{ - ewma_add(t->sequential_io_avg, - t->sequential_io, 8, 0); - - t->sequential_io = 0; -} - -static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) -{ - return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; -} - -static void check_should_skip(struct cached_dev *dc, struct search *s) -{ - struct cache_set *c = s->op.c; - struct bio *bio = &s->bio.bio; - unsigned mode = cache_mode(dc, bio); - unsigned sectors, congested = bch_get_congested(c); - - if (atomic_read(&dc->disk.detaching) || - c->gc_stats.in_use > CUTOFF_CACHE_ADD || - (bio->bi_rw & REQ_DISCARD)) - goto skip; - - if (mode == CACHE_MODE_NONE || - (mode == CACHE_MODE_WRITEAROUND && - (bio->bi_rw & REQ_WRITE))) - goto skip; - - if (bio->bi_sector & (c->sb.block_size - 1) || - bio_sectors(bio) & (c->sb.block_size - 1)) { - pr_debug("skipping unaligned io"); - goto skip; - } - - if (!congested && !dc->sequential_cutoff) - goto rescale; - - if (!congested && - mode == CACHE_MODE_WRITEBACK && - (bio->bi_rw & REQ_WRITE) && - (bio->bi_rw & REQ_SYNC)) - goto rescale; - - if (dc->sequential_merge) { - struct io *i; - - spin_lock(&dc->io_lock); - - hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) - if (i->last == bio->bi_sector && - time_before(jiffies, i->jiffies)) - goto found; - - i = list_first_entry(&dc->io_lru, struct io, lru); - - add_sequential(s->task); - i->sequential = 0; -found: - if (i->sequential + bio->bi_size > i->sequential) - i->sequential += bio->bi_size; - - i->last = bio_end_sector(bio); - i->jiffies = jiffies + msecs_to_jiffies(5000); - s->task->sequential_io = i->sequential; - - hlist_del(&i->hash); - hlist_add_head(&i->hash, iohash(dc, i->last)); - list_move_tail(&i->lru, &dc->io_lru); - - spin_unlock(&dc->io_lock); - } else { - s->task->sequential_io = bio->bi_size; - - add_sequential(s->task); - } - - sectors = max(s->task->sequential_io, - s->task->sequential_io_avg) >> 9; - - if (dc->sequential_cutoff && - sectors >= dc->sequential_cutoff >> 9) { - trace_bcache_bypass_sequential(s->orig_bio); - goto skip; - } - - if (congested && sectors >= congested) { - trace_bcache_bypass_congested(s->orig_bio); - goto skip; - } - -rescale: - bch_rescale_priorities(c, bio_sectors(bio)); - return; -skip: - bch_mark_sectors_bypassed(s, bio_sectors(bio)); - s->op.skip = true; -} - static void cached_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; @@ -1195,12 +1187,16 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) s = search_alloc(bio, d); trace_bcache_request_start(s, bio); - if (!bio_has_data(bio)) + if (!bio->bi_size) request_nodata(dc, s); - else if (rw) - request_write(dc, s); - else - request_read(dc, s); + else { + s->op.bypass = check_should_bypass(dc, s); + + if (rw) + request_write(dc, s); + else + request_read(dc, s); + } } else { if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(dc->bdev))) @@ -1298,21 +1294,21 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) trace_bcache_request_start(s, bio); - if (bio_has_data(bio) && !rw) { - closure_call(&s->op.cl, btree_read_async, NULL, cl); - } else if (bio_has_data(bio) || s->op.skip) { + if (!bio->bi_size) { + if (s->op.flush_journal) + bch_journal_meta(s->op.c, cl); + } else if (rw) { bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &KEY(d->id, bio->bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); + s->op.bypass = (bio->bi_rw & REQ_DISCARD) != 0; s->writeback = true; 
s->op.cache_bio = bio; closure_call(&s->op.cl, bch_insert_data, NULL, cl); } else { - /* No data - probably a cache flush */ - if (s->op.flush_journal) - bch_journal_meta(s->op.c, cl); + closure_call(&s->op.cl, btree_read_async, NULL, cl); } continue_at(cl, search_free, NULL); -- cgit v0.10.2 From cdd972b164be8fc69f6ee8533c5a07b621da74c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 17:06:17 -0700 Subject: bcache: Refactor read request code a bit More refactoring, and renaming. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index dbc2ef6..3b85f33 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -884,7 +884,7 @@ skip: /* Process reads */ -static void cached_dev_read_complete(struct closure *cl) +static void cached_dev_cache_miss_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); @@ -902,9 +902,10 @@ static void cached_dev_read_complete(struct closure *cl) cached_dev_bio_complete(cl); } -static void request_read_error(struct closure *cl) +static void cached_dev_read_error(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); + struct bio *bio = &s->bio.bio; struct bio_vec *bv; int i; @@ -928,20 +929,20 @@ static void request_read_error(struct closure *cl) /* XXX: invalidate cache */ - closure_bio_submit(&s->bio.bio, &s->cl, s->d); + closure_bio_submit(bio, cl, s->d); } - continue_at(cl, cached_dev_read_complete, NULL); + continue_at(cl, cached_dev_cache_miss_done, NULL); } -static void request_read_done(struct closure *cl) +static void cached_dev_read_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); /* - * s->cache_bio != NULL implies that we had a cache miss; cache_bio now - * contains data ready to be inserted into the cache. + * We had a cache miss; cache_bio now contains data ready to be inserted + * into the cache. 
* * First, we copy the data we just read from cache_bio's bounce buffers * to the buffers the original bio pointed to: @@ -971,10 +972,10 @@ static void request_read_done(struct closure *cl) closure_call(&s->op.cl, bch_insert_data, NULL, cl); } - continue_at(cl, cached_dev_read_complete, NULL); + continue_at(cl, cached_dev_cache_miss_done, NULL); } -static void request_read_done_bh(struct closure *cl) +static void cached_dev_read_done_bh(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); @@ -983,11 +984,11 @@ static void request_read_done_bh(struct closure *cl) trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.bypass); if (s->error) - continue_at_nobarrier(cl, request_read_error, bcache_wq); + continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); else if (s->op.cache_bio || verify(dc, &s->bio.bio)) - continue_at_nobarrier(cl, request_read_done, bcache_wq); + continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else - continue_at_nobarrier(cl, cached_dev_read_complete, NULL); + continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); } static int cached_dev_cache_miss(struct btree *b, struct search *s, @@ -996,7 +997,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, int ret = 0; unsigned reada = 0; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - struct bio *miss; + struct bio *miss, *cache_bio; if (s->cache_miss || s->op.bypass) { miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); @@ -1027,33 +1028,31 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, /* btree_search_recurse()'s btree iterator is no good anymore */ ret = -EINTR; - s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, + cache_bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), dc->disk.bio_split); - - if (!s->op.cache_bio) + if (!cache_bio) goto out_submit; - s->op.cache_bio->bi_sector = miss->bi_sector; - s->op.cache_bio->bi_bdev = miss->bi_bdev; - s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; + cache_bio->bi_sector = miss->bi_sector; + cache_bio->bi_bdev = miss->bi_bdev; + cache_bio->bi_size = s->cache_bio_sectors << 9; - s->op.cache_bio->bi_end_io = request_endio; - s->op.cache_bio->bi_private = &s->cl; + cache_bio->bi_end_io = request_endio; + cache_bio->bi_private = &s->cl; - bch_bio_map(s->op.cache_bio, NULL); - if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) + bch_bio_map(cache_bio, NULL); + if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; - s->cache_miss = miss; - bio_get(s->op.cache_bio); - - closure_bio_submit(s->op.cache_bio, &s->cl, s->d); + s->cache_miss = miss; + s->op.cache_bio = cache_bio; + bio_get(cache_bio); + closure_bio_submit(cache_bio, &s->cl, s->d); return ret; out_put: - bio_put(s->op.cache_bio); - s->op.cache_bio = NULL; + bio_put(cache_bio); out_submit: miss->bi_end_io = request_endio; miss->bi_private = &s->cl; @@ -1061,12 +1060,12 @@ out_submit: return ret; } -static void request_read(struct cached_dev *dc, struct search *s) +static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; closure_call(&s->op.cl, btree_read_async, NULL, cl); - continue_at(cl, request_read_done_bh, NULL); + continue_at(cl, cached_dev_read_done_bh, NULL); } /* Process writes */ @@ -1080,7 +1079,7 @@ static void cached_dev_write_complete(struct closure *cl) cached_dev_bio_complete(cl); } -static void request_write(struct 
cached_dev *dc, struct search *s) +static void cached_dev_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; @@ -1152,7 +1151,7 @@ static void request_write(struct cached_dev *dc, struct search *s) continue_at(cl, cached_dev_write_complete, NULL); } -static void request_nodata(struct cached_dev *dc, struct search *s) +static void cached_dev_nodata(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; @@ -1188,14 +1187,14 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) trace_bcache_request_start(s, bio); if (!bio->bi_size) - request_nodata(dc, s); + cached_dev_nodata(dc, s); else { s->op.bypass = check_should_bypass(dc, s); if (rw) - request_write(dc, s); + cached_dev_write(dc, s); else - request_read(dc, s); + cached_dev_read(dc, s); } } else { if ((bio->bi_rw & REQ_DISCARD) && -- cgit v0.10.2 From a34a8bfd4e6358c646928320d37b0425c0762f8a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2013 17:07:04 -0700 Subject: bcache: Refactor journalling flow control Making things less asynchronous that don't need to be - bch_journal() only has to block when the journal or journal entry is full, which is emphatically not a fast path. So make it a normal function that just returns when it finishes, to make the code and control flow easier to follow. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f960607..777c01d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2164,9 +2164,6 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, } } - if (op->journal) - atomic_dec_bug(op->journal); - op->journal = NULL; return ret; } diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 0003992..ab011f0 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -642,7 +642,7 @@ do { \ #define continue_at_nobarrier(_cl, _fn, _wq) \ do { \ set_closure_fn(_cl, _fn, _wq); \ - closure_queue(cl); \ + closure_queue(_cl); \ return; \ } while (0) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 1bdefdb..940e89e 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -318,7 +318,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, bch_keylist_push(&op->keys); op->journal = i->pin; - atomic_inc(op->journal); ret = bch_btree_insert(op, s, &op->keys); if (ret) @@ -357,48 +356,35 @@ static void btree_flush_write(struct cache_set *c) * Try to find the btree node with that references the oldest journal * entry, best is our current candidate and is locked if non NULL: */ - struct btree *b, *best = NULL; - unsigned iter; + struct btree *b, *best; + unsigned i; +retry: + best = NULL; + + for_each_cached_btree(b, c, i) + if (btree_current_write(b)->journal) { + if (!best) + best = b; + else if (journal_pin_cmp(c, + btree_current_write(best), + btree_current_write(b))) { + best = b; + } + } - for_each_cached_btree(b, c, iter) { - if (!down_write_trylock(&b->lock)) - continue; + b = best; + if (b) { + rw_lock(true, b, b->level); - if (!btree_node_dirty(b) || - !btree_current_write(b)->journal) { + if (!btree_current_write(b)->journal) { rw_unlock(true, b); - continue; + /* We raced */ + goto retry; } - if (!best) - best = b; - else if (journal_pin_cmp(c, - btree_current_write(best), - btree_current_write(b))) { - rw_unlock(true, best); - best = b; - } else - rw_unlock(true, b); + 
bch_btree_node_write(b, NULL); + rw_unlock(true, b); } - - if (best) - goto out; - - /* We can't find the best btree node, just pick the first */ - list_for_each_entry(b, &c->btree_cache, list) - if (!b->level && btree_node_dirty(b)) { - best = b; - rw_lock(true, best, best->level); - goto found; - } - -out: - if (!best) - return; -found: - if (btree_node_dirty(best)) - bch_btree_node_write(best, NULL); - rw_unlock(true, best); } #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) @@ -494,7 +480,7 @@ static void journal_reclaim(struct cache_set *c) do_journal_discard(ca); if (c->journal.blocks_free) - return; + goto out; /* * Allocate: @@ -520,7 +506,7 @@ static void journal_reclaim(struct cache_set *c) if (n) c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; - +out: if (!journal_full(&c->journal)) __closure_wake_up(&c->journal.wait); } @@ -659,7 +645,7 @@ static void journal_write(struct closure *cl) journal_write_unlocked(cl); } -static void __journal_try_write(struct cache_set *c, bool noflush) +static void journal_try_write(struct cache_set *c) __releases(c->journal.lock) { struct closure *cl = &c->journal.io; @@ -667,29 +653,59 @@ static void __journal_try_write(struct cache_set *c, bool noflush) w->need_write = true; - if (!closure_trylock(cl, &c->cl)) - spin_unlock(&c->journal.lock); - else if (noflush && journal_full(&c->journal)) { - spin_unlock(&c->journal.lock); - continue_at(cl, journal_write, system_wq); - } else + if (closure_trylock(cl, &c->cl)) journal_write_unlocked(cl); + else + spin_unlock(&c->journal.lock); } -#define journal_try_write(c) __journal_try_write(c, false) - -void bch_journal_meta(struct cache_set *c, struct closure *cl) +static struct journal_write *journal_wait_for_write(struct cache_set *c, + unsigned nkeys) { - struct journal_write *w; + size_t sectors; + struct closure cl; - if (CACHE_SYNC(&c->sb)) { - spin_lock(&c->journal.lock); - w = c->journal.cur; + closure_init_stack(&cl); + + spin_lock(&c->journal.lock); + + while (1) { + struct journal_write *w = c->journal.cur; + + sectors = __set_blocks(w->data, w->data->keys + nkeys, + c) * c->sb.block_size; + + if (sectors <= min_t(size_t, + c->journal.blocks_free * c->sb.block_size, + PAGE_SECTORS << JSET_BITS)) + return w; + + /* XXX: tracepoint */ + if (!journal_full(&c->journal)) { + trace_bcache_journal_entry_full(c); + + /* + * XXX: If we were inserting so many keys that they + * won't fit in an _empty_ journal write, we'll + * deadlock. For now, handle this in + * bch_keylist_realloc() - but something to think about. 
+ */ + BUG_ON(!w->data->keys); + + closure_wait(&w->wait, &cl); + journal_try_write(c); /* unlocks */ + } else { + trace_bcache_journal_full(c); + + closure_wait(&c->journal.wait, &cl); + journal_reclaim(c); + spin_unlock(&c->journal.lock); - if (cl) - BUG_ON(!closure_wait(&w->wait, cl)); + btree_flush_write(c); + } - __journal_try_write(c, true); + closure_sync(&cl); + spin_lock(&c->journal.lock); } } @@ -708,68 +724,26 @@ static void journal_write_work(struct work_struct *work) * bch_journal() hands those same keys off to btree_insert_async() */ -void bch_journal(struct closure *cl) +atomic_t *bch_journal(struct cache_set *c, + struct keylist *keys, + struct closure *parent) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct cache_set *c = op->c; struct journal_write *w; - size_t sectors, nkeys; - - if (op->type != BTREE_INSERT || - !CACHE_SYNC(&c->sb)) - goto out; - - /* - * If we're looping because we errored, might already be waiting on - * another journal write: - */ - while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING) - closure_sync(cl->parent); - - spin_lock(&c->journal.lock); - - if (journal_full(&c->journal)) { - trace_bcache_journal_full(c); - - closure_wait(&c->journal.wait, cl); - - journal_reclaim(c); - spin_unlock(&c->journal.lock); - - btree_flush_write(c); - continue_at(cl, bch_journal, bcache_wq); - } + atomic_t *ret; - w = c->journal.cur; - nkeys = w->data->keys + bch_keylist_nkeys(&op->keys); - sectors = __set_blocks(w->data, nkeys, c) * c->sb.block_size; + if (!CACHE_SYNC(&c->sb)) + return NULL; - if (sectors > min_t(size_t, - c->journal.blocks_free * c->sb.block_size, - PAGE_SECTORS << JSET_BITS)) { - trace_bcache_journal_entry_full(c); + w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); - /* - * XXX: If we were inserting so many keys that they won't fit in - * an _empty_ journal write, we'll deadlock. For now, handle - * this in bch_keylist_realloc() - but something to think about. 
- */ - BUG_ON(!w->data->keys); + memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); + w->data->keys += bch_keylist_nkeys(keys); - BUG_ON(!closure_wait(&w->wait, cl)); + ret = &fifo_back(&c->journal.pin); + atomic_inc(ret); - journal_try_write(c); - continue_at(cl, bch_journal, bcache_wq); - } - - memcpy(end(w->data), op->keys.keys, bch_keylist_bytes(&op->keys)); - w->data->keys += bch_keylist_nkeys(&op->keys); - - op->journal = &fifo_back(&c->journal.pin); - atomic_inc(op->journal); - - if (op->flush_journal) { - closure_wait(&w->wait, cl->parent); + if (parent) { + closure_wait(&w->wait, parent); journal_try_write(c); } else if (!w->need_write) { schedule_delayed_work(&c->journal.work, @@ -778,8 +752,21 @@ void bch_journal(struct closure *cl) } else { spin_unlock(&c->journal.lock); } -out: - bch_btree_insert_async(cl); + + + return ret; +} + +void bch_journal_meta(struct cache_set *c, struct closure *cl) +{ + struct keylist keys; + atomic_t *ref; + + bch_keylist_init(&keys); + + ref = bch_journal(c, &keys, cl); + if (ref) + atomic_dec_bug(ref); } void bch_journal_free(struct cache_set *c) diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 3ca93d3..7045e6f 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -200,8 +200,9 @@ struct journal_device { struct closure; struct cache_set; struct btree_op; +struct keylist; -void bch_journal(struct closure *); +atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); void bch_journal_next(struct journal *); void bch_journal_mark(struct cache_set *, struct list_head *); void bch_journal_meta(struct cache_set *, struct closure *); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index dd8a035..2c42377 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -110,7 +110,7 @@ static void write_moving(struct closure *cl) bkey_copy(&s->op.replace, &io->w->key); closure_init(&s->op.cl, cl); - bch_insert_data(&s->op.cl); + bch_data_insert(&s->op.cl); } continue_at(cl, write_moving_finish, bch_gc_wq); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 3b85f33..1c3af44 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -25,6 +25,8 @@ struct kmem_cache *bch_search_cache; +static void bch_data_insert_start(struct closure *); + /* Cgroup interface */ #ifdef CONFIG_CGROUP_BCACHE @@ -211,31 +213,42 @@ static void bio_csum(struct bio *bio, struct bkey *k) /* Insert data into cache */ -static void bio_invalidate(struct closure *cl) +static void bch_data_insert_keys(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); - struct bio *bio = op->cache_bio; + struct search *s = container_of(op, struct search, op); - pr_debug("invalidating %i sectors from %llu", - bio_sectors(bio), (uint64_t) bio->bi_sector); + /* + * If we're looping, might already be waiting on + * another journal write - can't wait on more than one journal write at + * a time + * + * XXX: this looks wrong + */ +#if 0 + while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) + closure_sync(&s->cl); +#endif - while (bio_sectors(bio)) { - unsigned len = min(bio_sectors(bio), 1U << 14); + if (s->write) + op->journal = bch_journal(op->c, &op->keys, + op->flush_journal + ? 
&s->cl : NULL); - if (bch_keylist_realloc(&op->keys, 0, op->c)) - goto out; + if (bch_btree_insert(op, op->c, &op->keys)) { + s->error = -ENOMEM; + op->insert_data_done = true; + } - bio->bi_sector += len; - bio->bi_size -= len << 9; + if (op->journal) + atomic_dec_bug(op->journal); + op->journal = NULL; - bch_keylist_add(&op->keys, - &KEY(op->inode, bio->bi_sector, len)); - } + if (!op->insert_data_done) + continue_at(cl, bch_data_insert_start, bcache_wq); - op->insert_data_done = true; - bio_put(bio); -out: - continue_at(cl, bch_journal, bcache_wq); + bch_keylist_free(&op->keys); + closure_return(cl); } struct open_bucket { @@ -423,7 +436,34 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, return true; } -static void bch_insert_data_error(struct closure *cl) +static void bch_data_invalidate(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + struct bio *bio = op->cache_bio; + + pr_debug("invalidating %i sectors from %llu", + bio_sectors(bio), (uint64_t) bio->bi_sector); + + while (bio_sectors(bio)) { + unsigned len = min(bio_sectors(bio), 1U << 14); + + if (bch_keylist_realloc(&op->keys, 0, op->c)) + goto out; + + bio->bi_sector += len; + bio->bi_size -= len << 9; + + bch_keylist_add(&op->keys, &KEY(op->inode, + bio->bi_sector, len)); + } + + op->insert_data_done = true; + bio_put(bio); +out: + continue_at(cl, bch_data_insert_keys, bcache_wq); +} + +static void bch_data_insert_error(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); @@ -450,10 +490,10 @@ static void bch_insert_data_error(struct closure *cl) op->keys.top = dst; - bch_journal(cl); + bch_data_insert_keys(cl); } -static void bch_insert_data_endio(struct bio *bio, int error) +static void bch_data_insert_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; struct btree_op *op = container_of(cl, struct btree_op, cl); @@ -464,7 +504,7 @@ static void bch_insert_data_endio(struct bio *bio, int error) if (s->writeback) s->error = error; else if (s->write) - set_closure_fn(cl, bch_insert_data_error, bcache_wq); + set_closure_fn(cl, bch_data_insert_error, bcache_wq); else set_closure_fn(cl, NULL, NULL); } @@ -472,14 +512,14 @@ static void bch_insert_data_endio(struct bio *bio, int error) bch_bbio_endio(op->c, bio, error, "writing data to cache"); } -static void bch_insert_data_loop(struct closure *cl) +static void bch_data_insert_start(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); struct search *s = container_of(op, struct search, op); struct bio *bio = op->cache_bio, *n; if (op->bypass) - return bio_invalidate(cl); + return bch_data_invalidate(cl); if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { set_gc_sectors(op->c); @@ -502,7 +542,7 @@ static void bch_insert_data_loop(struct closure *cl) if (bch_keylist_realloc(&op->keys, 1 + (op->csum ? 
1 : 0), op->c)) - continue_at(cl, bch_journal, bcache_wq); + continue_at(cl, bch_data_insert_keys, bcache_wq); k = op->keys.top; bkey_init(k); @@ -514,7 +554,7 @@ static void bch_insert_data_loop(struct closure *cl) n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); - n->bi_end_io = bch_insert_data_endio; + n->bi_end_io = bch_data_insert_endio; n->bi_private = cl; if (s->writeback) { @@ -537,7 +577,7 @@ static void bch_insert_data_loop(struct closure *cl) } while (n != bio); op->insert_data_done = true; - continue_at(cl, bch_journal, bcache_wq); + continue_at(cl, bch_data_insert_keys, bcache_wq); err: /* bch_alloc_sectors() blocks if s->writeback = true */ BUG_ON(s->writeback); @@ -556,7 +596,7 @@ err: * rest of the write. */ op->bypass = true; - return bio_invalidate(cl); + return bch_data_invalidate(cl); } else { /* * From a cache miss, we can just insert the keys for the data @@ -566,14 +606,14 @@ err: bio_put(bio); if (!bch_keylist_empty(&op->keys)) - continue_at(cl, bch_journal, bcache_wq); + continue_at(cl, bch_data_insert_keys, bcache_wq); else closure_return(cl); } } /** - * bch_insert_data - stick some data in the cache + * bch_data_insert - stick some data in the cache * * This is the starting point for any data to end up in a cache device; it could * be from a normal write, or a writeback write, or a write to a flash only @@ -591,30 +631,13 @@ err: * If op->bypass is true, instead of inserting the data it invalidates the * region of the cache represented by op->cache_bio and op->inode. */ -void bch_insert_data(struct closure *cl) +void bch_data_insert(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); bch_keylist_init(&op->keys); bio_get(op->cache_bio); - bch_insert_data_loop(cl); -} - -void bch_btree_insert_async(struct closure *cl) -{ - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); - - if (bch_btree_insert(op, op->c, &op->keys)) { - s->error = -ENOMEM; - op->insert_data_done = true; - } - - if (op->insert_data_done) { - bch_keylist_free(&op->keys); - closure_return(cl); - } else - continue_at(cl, bch_insert_data_loop, bcache_wq); + bch_data_insert_start(cl); } /* Common code for the make_request functions */ @@ -969,7 +992,7 @@ static void cached_dev_read_done(struct closure *cl) if (s->op.cache_bio && !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { s->op.type = BTREE_REPLACE; - closure_call(&s->op.cl, bch_insert_data, NULL, cl); + closure_call(&s->op.cl, bch_data_insert, NULL, cl); } continue_at(cl, cached_dev_cache_miss_done, NULL); @@ -1147,13 +1170,13 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(bio, cl, s->d); } - closure_call(&s->op.cl, bch_insert_data, NULL, cl); + closure_call(&s->op.cl, bch_data_insert, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); } -static void cached_dev_nodata(struct cached_dev *dc, struct search *s) +static void cached_dev_nodata(struct closure *cl) { - struct closure *cl = &s->cl; + struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; if (s->op.flush_journal) @@ -1186,9 +1209,15 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) s = search_alloc(bio, d); trace_bcache_request_start(s, bio); - if (!bio->bi_size) - cached_dev_nodata(dc, s); - else { + if (!bio->bi_size) { + /* + * can't call bch_journal_meta from under + * generic_make_request + */ + continue_at_nobarrier(&s->cl, + cached_dev_nodata, + bcache_wq); + 
} else { s->op.bypass = check_should_bypass(dc, s); if (rw) @@ -1275,6 +1304,16 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s, return 0; } +static void flash_dev_nodata(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + + if (s->op.flush_journal) + bch_journal_meta(s->op.c, cl); + + continue_at(cl, search_free, NULL); +} + static void flash_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; @@ -1294,8 +1333,13 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) trace_bcache_request_start(s, bio); if (!bio->bi_size) { - if (s->op.flush_journal) - bch_journal_meta(s->op.c, cl); + /* + * can't call bch_journal_meta from under + * generic_make_request + */ + continue_at_nobarrier(&s->cl, + flash_dev_nodata, + bcache_wq); } else if (rw) { bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &KEY(d->id, bio->bi_sector, 0), @@ -1305,7 +1349,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) s->writeback = true; s->op.cache_bio = bio; - closure_call(&s->op.cl, bch_insert_data, NULL, cl); + closure_call(&s->op.cl, bch_data_insert, NULL, cl); } else { closure_call(&s->op.cl, btree_read_async, NULL, cl); } diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 57dc478..1f1b59d 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -31,8 +31,7 @@ struct search { void bch_cache_read_endio(struct bio *, int); unsigned bch_get_congested(struct cache_set *); -void bch_insert_data(struct closure *cl); -void bch_btree_insert_async(struct closure *); +void bch_data_insert(struct closure *cl); void bch_cache_read_endio(struct bio *, int); void bch_open_buckets_free(struct cache_set *); -- cgit v0.10.2 From 0b93207abb40d3c42bb83eba1e1e7edc1da77810 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:26:51 -0700 Subject: bcache: Move keylist out of btree_op Slowly working on pruning struct btree_op - the aim is for it to only contain things that are actually necessary for traversing the btree. 
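For reference, the shape this is heading toward, sketched from the hunks below
(field lists abbreviated to the members this patch actually touches):

    /* before: every btree_op carried a keylist, whether or not the
     * operation inserted anything
     */
    struct btree_op {
        /* ... closure, locking level, flags ... */
        unsigned            lookup_done:1;
        unsigned            insert_collision:1;

        /* Anything after this point won't get zeroed in do_bio_hook() */
        struct keylist      keys;       /* keys to be inserted */
        BKEY_PADDED(replace);
    };

    /* after: btree_op keeps only traversal state; the insert keys move
     * into the request
     */
    struct btree_op {
        /* ... closure, locking level, flags ... */
        unsigned            lookup_done:1;
        unsigned            insert_collision:1;

        BKEY_PADDED(replace);
    };

    struct search {
        /* ... bios, error, timing ... */
        struct btree_op     op;

        /* Anything past this point won't get zeroed in search_alloc() */
        struct keylist      insert_keys;
    };

Callers that only need a temporary keylist (journal replay, the writeback path)
now just declare one on the stack and hand it to bch_btree_insert().
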
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 777c01d..731cd8e 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -113,7 +113,6 @@ void bch_btree_op_init_stack(struct btree_op *op) memset(op, 0, sizeof(struct btree_op)); closure_init_stack(&op->cl); op->lock = -1; - bch_keylist_init(&op->keys); } /* Btree key manipulation */ diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index ea0814b..17b7a4e 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -267,10 +267,6 @@ struct btree_op { unsigned lookup_done:1; unsigned insert_collision:1; - /* Anything after this point won't get zeroed in do_bio_hook() */ - - /* Keys to be inserted */ - struct keylist keys; BKEY_PADDED(replace); }; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 940e89e..8866f8e 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -301,6 +301,9 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, list_entry(list->prev, struct journal_replay, list); uint64_t start = i->j.last_seq, end = i->j.seq, n = start; + struct keylist keylist; + + bch_keylist_init(&keylist); list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); @@ -314,16 +317,16 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, k = bkey_next(k)) { trace_bcache_journal_replay_key(k); - bkey_copy(op->keys.top, k); - bch_keylist_push(&op->keys); + bkey_copy(keylist.top, k); + bch_keylist_push(&keylist); op->journal = i->pin; - ret = bch_btree_insert(op, s, &op->keys); + ret = bch_btree_insert(op, s, &keylist); if (ret) goto err; - BUG_ON(!bch_keylist_empty(&op->keys)); + BUG_ON(!bch_keylist_empty(&keylist)); keys++; cond_resched(); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 1c3af44..d85c700 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -231,11 +231,11 @@ static void bch_data_insert_keys(struct closure *cl) #endif if (s->write) - op->journal = bch_journal(op->c, &op->keys, + op->journal = bch_journal(op->c, &s->insert_keys, op->flush_journal ? 
&s->cl : NULL); - if (bch_btree_insert(op, op->c, &op->keys)) { + if (bch_btree_insert(op, op->c, &s->insert_keys)) { s->error = -ENOMEM; op->insert_data_done = true; } @@ -247,7 +247,7 @@ static void bch_data_insert_keys(struct closure *cl) if (!op->insert_data_done) continue_at(cl, bch_data_insert_start, bcache_wq); - bch_keylist_free(&op->keys); + bch_keylist_free(&s->insert_keys); closure_return(cl); } @@ -439,6 +439,7 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, static void bch_data_invalidate(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); struct bio *bio = op->cache_bio; pr_debug("invalidating %i sectors from %llu", @@ -447,14 +448,14 @@ static void bch_data_invalidate(struct closure *cl) while (bio_sectors(bio)) { unsigned len = min(bio_sectors(bio), 1U << 14); - if (bch_keylist_realloc(&op->keys, 0, op->c)) + if (bch_keylist_realloc(&s->insert_keys, 0, op->c)) goto out; bio->bi_sector += len; bio->bi_size -= len << 9; - bch_keylist_add(&op->keys, &KEY(op->inode, - bio->bi_sector, len)); + bch_keylist_add(&s->insert_keys, + &KEY(op->inode, bio->bi_sector, len)); } op->insert_data_done = true; @@ -466,6 +467,7 @@ out: static void bch_data_insert_error(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); /* * Our data write just errored, which means we've got a bunch of keys to @@ -476,9 +478,9 @@ static void bch_data_insert_error(struct closure *cl) * from the keys we'll accomplish just that. */ - struct bkey *src = op->keys.keys, *dst = op->keys.keys; + struct bkey *src = s->insert_keys.keys, *dst = s->insert_keys.keys; - while (src != op->keys.top) { + while (src != s->insert_keys.top) { struct bkey *n = bkey_next(src); SET_KEY_PTRS(src, 0); @@ -488,7 +490,7 @@ static void bch_data_insert_error(struct closure *cl) src = n; } - op->keys.top = dst; + s->insert_keys.top = dst; bch_data_insert_keys(cl); } @@ -539,12 +541,12 @@ static void bch_data_insert_start(struct closure *cl) ? s->d->bio_split : op->c->bio_split; /* 1 for the device pointer and 1 for the chksum */ - if (bch_keylist_realloc(&op->keys, + if (bch_keylist_realloc(&s->insert_keys, 1 + (op->csum ? 
1 : 0), op->c)) continue_at(cl, bch_data_insert_keys, bcache_wq); - k = op->keys.top; + k = s->insert_keys.top; bkey_init(k); SET_KEY_INODE(k, op->inode); SET_KEY_OFFSET(k, bio->bi_sector); @@ -570,7 +572,7 @@ static void bch_data_insert_start(struct closure *cl) bio_csum(n, k); trace_bcache_cache_insert(k); - bch_keylist_push(&op->keys); + bch_keylist_push(&s->insert_keys); n->bi_rw |= REQ_WRITE; bch_submit_bbio(n, op->c, k, 0); @@ -605,7 +607,7 @@ err: op->insert_data_done = true; bio_put(bio); - if (!bch_keylist_empty(&op->keys)) + if (!bch_keylist_empty(&s->insert_keys)) continue_at(cl, bch_data_insert_keys, bcache_wq); else closure_return(cl); @@ -634,8 +636,9 @@ err: void bch_data_insert(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); - bch_keylist_init(&op->keys); + bch_keylist_init(&s->insert_keys); bio_get(op->cache_bio); bch_data_insert_start(cl); } @@ -724,9 +727,11 @@ static void search_free(struct closure *cl) static struct search *search_alloc(struct bio *bio, struct bcache_device *d) { + struct search *s; struct bio_vec *bv; - struct search *s = mempool_alloc(d->c->search, GFP_NOIO); - memset(s, 0, offsetof(struct search, op.keys)); + + s = mempool_alloc(d->c->search, GFP_NOIO); + memset(s, 0, offsetof(struct search, insert_keys)); __closure_init(&s->cl, NULL); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 1f1b59d..7d02ac5 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -25,8 +25,10 @@ struct search { short error; unsigned long start_time; - /* Anything past op->keys won't get zeroed in do_bio_hook */ struct btree_op op; + + /* Anything past this point won't get zeroed in search_alloc() */ + struct keylist insert_keys; }; void bch_cache_read_endio(struct bio *, int); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8ffc8ec..51dc709 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -300,18 +300,21 @@ static void write_dirty_finish(struct closure *cl) if (KEY_DIRTY(&w->key)) { unsigned i; struct btree_op op; + struct keylist keys; + bch_btree_op_init_stack(&op); + bch_keylist_init(&keys); op.type = BTREE_REPLACE; bkey_copy(&op.replace, &w->key); SET_KEY_DIRTY(&w->key, false); - bch_keylist_add(&op.keys, &w->key); + bch_keylist_add(&keys, &w->key); for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - bch_btree_insert(&op, dc->disk.c, &op.keys); + bch_btree_insert(&op, dc->disk.c, &keys); closure_sync(&op.cl); if (op.insert_collision) -- cgit v0.10.2 From e8e1d4682c8cb06dbcb5ef7bb851bf9bcb889c84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:27:07 -0700 Subject: bcache: Convert try_wait to wait_queue_head_t We never waited on c->try_wait asynchronously, so just use the standard primitives. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 10ce0c8..c1c4419 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -735,8 +735,8 @@ struct cache_set { * basically a lock for this that we can wait on asynchronously. The * btree_root() macro releases the lock when it returns. 
*/ - struct closure *try_harder; - struct closure_waitlist try_wait; + struct task_struct *try_harder; + wait_queue_head_t try_wait; uint64_t try_harder_start; /* diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 731cd8e..4d50f1e 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -437,7 +437,7 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) set_btree_node_dirty(b); - if (op && op->journal) { + if (op->journal) { if (w->journal && journal_pin_cmp(b->c, w, op)) { atomic_dec_bug(w->journal); @@ -574,34 +574,35 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, return b; } -static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) +static int mca_reap(struct btree *b, unsigned min_order, bool flush) { + struct closure cl; + + closure_init_stack(&cl); lockdep_assert_held(&b->c->bucket_lock); if (!down_write_trylock(&b->lock)) return -ENOMEM; - if (b->page_order < min_order) { + BUG_ON(btree_node_dirty(b) && !b->sets[0].data); + + if (b->page_order < min_order || + (!flush && + (btree_node_dirty(b) || + atomic_read(&b->io.cl.remaining) != -1))) { rw_unlock(true, b); return -ENOMEM; } - BUG_ON(btree_node_dirty(b) && !b->sets[0].data); - - if (cl && btree_node_dirty(b)) - bch_btree_node_write(b, NULL); - - if (cl) - closure_wait_event_async(&b->io.wait, cl, - atomic_read(&b->io.cl.remaining) == -1); - - if (btree_node_dirty(b) || - !closure_is_unlocked(&b->io.cl) || - work_pending(&b->work.work)) { - rw_unlock(true, b); - return -EAGAIN; + if (btree_node_dirty(b)) { + bch_btree_node_write(b, &cl); + closure_sync(&cl); } + /* wait for any in flight btree write */ + closure_wait_event_sync(&b->io.wait, &cl, + atomic_read(&b->io.cl.remaining) == -1); + return 0; } @@ -641,7 +642,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, break; if (++i > 3 && - !mca_reap(b, NULL, 0)) { + !mca_reap(b, 0, false)) { mca_data_free(b); rw_unlock(true, b); freed++; @@ -660,7 +661,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, list_rotate_left(&c->btree_cache); if (!b->accessed && - !mca_reap(b, NULL, 0)) { + !mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); @@ -783,52 +784,27 @@ out: return b; } -static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, - int level, struct closure *cl) +static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) { - int ret = -ENOMEM; - struct btree *i; + struct btree *b; trace_bcache_btree_cache_cannibalize(c); - if (!cl) - return ERR_PTR(-ENOMEM); - - /* - * Trying to free up some memory - i.e. reuse some btree nodes - may - * require initiating IO to flush the dirty part of the node. If we're - * running under generic_make_request(), that IO will never finish and - * we would deadlock. Returning -EAGAIN causes the cache lookup code to - * punt to workqueue and retry. 
- */ - if (current->bio_list) - return ERR_PTR(-EAGAIN); - - if (c->try_harder && c->try_harder != cl) { - closure_wait_event_async(&c->try_wait, cl, !c->try_harder); - return ERR_PTR(-EAGAIN); - } + if (!c->try_harder) { + c->try_harder = current; + c->try_harder_start = local_clock(); + } else if (c->try_harder != current) + return ERR_PTR(-ENOSPC); - c->try_harder = cl; - c->try_harder_start = local_clock(); -retry: - list_for_each_entry_reverse(i, &c->btree_cache, list) { - int r = mca_reap(i, cl, btree_order(k)); - if (!r) - return i; - if (r != -ENOMEM) - ret = r; - } + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), false)) + return b; - if (ret == -EAGAIN && - closure_blocking(cl)) { - mutex_unlock(&c->bucket_lock); - closure_sync(cl); - mutex_lock(&c->bucket_lock); - goto retry; - } + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), true)) + return b; - return ERR_PTR(ret); + return ERR_PTR(-ENOMEM); } /* @@ -839,18 +815,19 @@ retry: */ void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) { - if (c->try_harder == cl) { + if (c->try_harder == current) { bch_time_stats_update(&c->try_harder_time, c->try_harder_start); c->try_harder = NULL; - __closure_wake_up(&c->try_wait); + wake_up(&c->try_wait); } } -static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, - int level, struct closure *cl) +static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) { struct btree *b; + BUG_ON(current->bio_list); + lockdep_assert_held(&c->bucket_lock); if (mca_find(c, k)) @@ -860,14 +837,14 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, * the list. Check if there's any freed nodes there: */ list_for_each_entry(b, &c->btree_cache_freeable, list) - if (!mca_reap(b, NULL, btree_order(k))) + if (!mca_reap(b, btree_order(k), false)) goto out; /* We never free struct btree itself, just the memory that holds the on * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, &c->btree_cache_freed, list) - if (!mca_reap(b, NULL, 0)) { + if (!mca_reap(b, 0, false)) { mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); if (!b->sets[0].data) goto err; @@ -901,7 +878,7 @@ err: if (b) rw_unlock(true, b); - b = mca_cannibalize(c, k, level, cl); + b = mca_cannibalize(c, k); if (!IS_ERR(b)) goto out; @@ -919,10 +896,9 @@ err: * level and op->lock. 
*/ struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, - int level, struct btree_op *op) + int level, bool write) { int i = 0; - bool write = level <= op->lock; struct btree *b; BUG_ON(level < 0); @@ -934,7 +910,7 @@ retry: return ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level, &op->cl); + b = mca_alloc(c, k, level); mutex_unlock(&c->bucket_lock); if (!b) @@ -980,7 +956,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) struct btree *b; mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level, NULL); + b = mca_alloc(c, k, level); mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { @@ -991,17 +967,12 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) /* Btree alloc */ -static void btree_node_free(struct btree *b, struct btree_op *op) +static void btree_node_free(struct btree *b) { unsigned i; trace_bcache_btree_node_free(b); - /* - * The BUG_ON() in btree_node_get() implies that we must have a write - * lock on parent to free or even invalidate a node - */ - BUG_ON(op->lock <= b->level); BUG_ON(b == b->c->root); if (btree_node_dirty(b)) @@ -1037,7 +1008,7 @@ retry: SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); - b = mca_alloc(c, &k.key, level, cl); + b = mca_alloc(c, &k.key, level); if (IS_ERR(b)) goto err_free; @@ -1173,8 +1144,7 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, return stale; } -static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, - struct btree_op *op) +static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k) { /* * We block priorities from being written for the duration of garbage @@ -1191,7 +1161,7 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, memcpy(k->ptr, b->key.ptr, sizeof(uint64_t) * KEY_PTRS(&b->key)); - btree_node_free(n, op); + btree_node_free(n); up_write(&n->lock); } @@ -1211,8 +1181,8 @@ struct gc_merge_info { unsigned keys; }; -static void btree_gc_coalesce(struct btree *b, struct btree_op *op, - struct gc_stat *gc, struct gc_merge_info *r) +static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, + struct gc_merge_info *r) { unsigned nodes = 0, keys = 0, blocks; int i; @@ -1228,7 +1198,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, for (i = nodes - 1; i >= 0; --i) { if (r[i].b->written) - r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); + r[i].b = btree_gc_alloc(r[i].b, r[i].k); if (r[i].b->written) return; @@ -1292,7 +1262,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, r[i - 1].keys = n2->keys; } - btree_node_free(r->b, op); + btree_node_free(r->b); up_write(&r->b->lock); trace_bcache_btree_gc_coalesce(nodes); @@ -1324,7 +1294,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, memset(r, 0, sizeof(r)); while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { - r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); + r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true); if (IS_ERR(r->b)) { ret = PTR_ERR(r->b); @@ -1337,7 +1307,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, if (!b->written && (r->b->level || stale > 10 || b->c->gc_always_rewrite)) - r->b = btree_gc_alloc(r->b, r->k, op); + r->b = btree_gc_alloc(r->b, r->k); if (r->b->level) ret = btree_gc_recurse(r->b, op, writes, gc); @@ -1350,7 +1320,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, bkey_copy_key(&b->c->gc_done, r->k); if (!b->written) - btree_gc_coalesce(b, op, gc, r); 
+ btree_gc_coalesce(b, gc, r); if (r[GC_MERGE_NODES - 1].b) write(r[GC_MERGE_NODES - 1].b); @@ -1404,7 +1374,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, if (!IS_ERR_OR_NULL(n)) { closure_sync(&op->cl); bch_btree_set_root(b); - btree_node_free(n, op); + btree_node_free(n); rw_unlock(true, b); } @@ -2004,18 +1974,18 @@ static int btree_split(struct btree *b, struct btree_op *op, } rw_unlock(true, n1); - btree_node_free(b, op); + btree_node_free(b); bch_time_stats_update(&b->c->btree_split_time, start_time); return 0; err_free2: __bkey_put(n2->c, &n2->key); - btree_node_free(n2, op); + btree_node_free(n2); rw_unlock(true, n2); err_free1: __bkey_put(n1->c, &n1->key); - btree_node_free(n1, op); + btree_node_free(n1); rw_unlock(true, n1); err: if (n3 == ERR_PTR(-EAGAIN) || diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 17b7a4e..72794ab 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -326,7 +326,7 @@ static inline void rw_unlock(bool w, struct btree *b) ({ \ int _r, l = (b)->level - 1; \ bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, key, l, op); \ + struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ if (!IS_ERR(_child)) { \ _child->parent = (b); \ _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ @@ -356,6 +356,11 @@ static inline void rw_unlock(bool w, struct btree *b) } \ rw_unlock(_w, _b); \ bch_cannibalize_unlock(c, &(op)->cl); \ + if (_r == -ENOSPC) { \ + wait_event((c)->try_wait, \ + !(c)->try_harder); \ + _r = -EINTR; \ + } \ } while (_r == -EINTR); \ \ _r; \ @@ -375,8 +380,7 @@ void bch_btree_node_write(struct btree *, struct closure *); void bch_cannibalize_unlock(struct cache_set *, struct closure *); void bch_btree_set_root(struct btree *); struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); -struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, - int, struct btree_op *); +struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d3169c0..9a164cd 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1436,12 +1436,14 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->sort_crit_factor = int_sqrt(c->btree_pages); - mutex_init(&c->bucket_lock); - mutex_init(&c->sort_lock); - spin_lock_init(&c->sort_time_lock); closure_init_unlocked(&c->sb_write); + mutex_init(&c->bucket_lock); + init_waitqueue_head(&c->try_wait); closure_init_unlocked(&c->uuid_write); + spin_lock_init(&c->sort_time_lock); + mutex_init(&c->sort_lock); spin_lock_init(&c->btree_read_time_lock); + bch_moving_init_cache_set(c); INIT_LIST_HEAD(&c->list); @@ -1529,7 +1531,7 @@ static void run_cache_set(struct cache_set *c) goto err; err = "error reading btree root"; - c->root = bch_btree_node_get(c, k, j->btree_level, &op); + c->root = bch_btree_node_get(c, k, j->btree_level, true); if (IS_ERR_OR_NULL(c->root)) goto err; -- cgit v0.10.2 From 35fcd848d72683141052aa9880542461577f2dbe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:29:09 -0700 Subject: bcache: Convert bucket_wait to wait_queue_head_t At one point we did do fancy asynchronous waiting stuff with bucket_wait, but that's all gone (and bucket_wait is used a lot less than it used to be). So use the standard primitives. 
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index e033b02..1b64e66 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -339,7 +339,7 @@ static int bch_allocator_thread(void *arg) allocator_wait(ca, !fifo_full(&ca->free)); fifo_push(&ca->free, bucket); - closure_wake_up(&ca->set->bucket_wait); + wake_up(&ca->set->bucket_wait); } /* @@ -365,16 +365,41 @@ static int bch_allocator_thread(void *arg) } } -long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) +long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) { - long r = -1; -again: + DEFINE_WAIT(w); + struct bucket *b; + long r; + + /* fastpath */ + if (fifo_used(&ca->free) > ca->watermark[watermark]) { + fifo_pop(&ca->free, r); + goto out; + } + + if (!wait) + return -1; + + while (1) { + if (fifo_used(&ca->free) > ca->watermark[watermark]) { + fifo_pop(&ca->free, r); + break; + } + + prepare_to_wait(&ca->set->bucket_wait, &w, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&ca->set->bucket_lock); + schedule(); + mutex_lock(&ca->set->bucket_lock); + } + + finish_wait(&ca->set->bucket_wait, &w); +out: wake_up_process(ca->alloc_thread); - if (fifo_used(&ca->free) > ca->watermark[watermark] && - fifo_pop(&ca->free, r)) { - struct bucket *b = ca->buckets + r; #ifdef CONFIG_BCACHE_EDEBUG + { size_t iter; long i; @@ -387,36 +412,23 @@ again: BUG_ON(i == r); fifo_for_each(i, &ca->unused, iter) BUG_ON(i == r); -#endif - BUG_ON(atomic_read(&b->pin) != 1); - - SET_GC_SECTORS_USED(b, ca->sb.bucket_size); - - if (watermark <= WATERMARK_METADATA) { - SET_GC_MARK(b, GC_MARK_METADATA); - b->prio = BTREE_PRIO; - } else { - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); - b->prio = INITIAL_PRIO; - } - - return r; } +#endif + b = ca->buckets + r; - trace_bcache_alloc_fail(ca); + BUG_ON(atomic_read(&b->pin) != 1); - if (cl) { - closure_wait(&ca->set->bucket_wait, cl); + SET_GC_SECTORS_USED(b, ca->sb.bucket_size); - if (closure_blocking(cl)) { - mutex_unlock(&ca->set->bucket_lock); - closure_sync(cl); - mutex_lock(&ca->set->bucket_lock); - goto again; - } + if (watermark <= WATERMARK_METADATA) { + SET_GC_MARK(b, GC_MARK_METADATA); + b->prio = BTREE_PRIO; + } else { + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + b->prio = INITIAL_PRIO; } - return -1; + return r; } void bch_bucket_free(struct cache_set *c, struct bkey *k) @@ -433,7 +445,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) } int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, - struct bkey *k, int n, struct closure *cl) + struct bkey *k, int n, bool wait) { int i; @@ -446,7 +458,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, for (i = 0; i < n; i++) { struct cache *ca = c->cache_by_alloc[i]; - long b = bch_bucket_alloc(ca, watermark, cl); + long b = bch_bucket_alloc(ca, watermark, wait); if (b == -1) goto err; @@ -466,11 +478,11 @@ err: } int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, - struct bkey *k, int n, struct closure *cl) + struct bkey *k, int n, bool wait) { int ret; mutex_lock(&c->bucket_lock); - ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); + ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); mutex_unlock(&c->bucket_lock); return ret; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index c1c4419..d352074 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -750,7 +750,7 @@ struct cache_set { * written. 
*/ atomic_t prio_blocked; - struct closure_waitlist bucket_wait; + wait_queue_head_t bucket_wait; /* * For any bio we don't skip we subtract the number of sectors from @@ -1162,13 +1162,13 @@ uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); bool bch_bucket_add_unused(struct cache *, struct bucket *); -long bch_bucket_alloc(struct cache *, unsigned, struct closure *); +long bch_bucket_alloc(struct cache *, unsigned, bool); void bch_bucket_free(struct cache_set *, struct bkey *); int __bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, struct closure *); + struct bkey *, int, bool); int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, struct closure *); + struct bkey *, int, bool); __printf(2, 3) bool bch_cache_set_error(struct cache_set *, const char *, ...); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 4d50f1e..935d90d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -813,7 +813,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. */ -void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) +void bch_cannibalize_unlock(struct cache_set *c) { if (c->try_harder == current) { bch_time_stats_update(&c->try_harder_time, c->try_harder_start); @@ -995,15 +995,14 @@ static void btree_node_free(struct btree *b) mutex_unlock(&b->c->bucket_lock); } -struct btree *bch_btree_node_alloc(struct cache_set *c, int level, - struct closure *cl) +struct btree *bch_btree_node_alloc(struct cache_set *c, int level) { BKEY_PADDED(key) k; struct btree *b = ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) + if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) goto err; SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); @@ -1036,10 +1035,9 @@ err: return b; } -static struct btree *btree_node_alloc_replacement(struct btree *b, - struct closure *cl) +static struct btree *btree_node_alloc_replacement(struct btree *b) { - struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); + struct btree *n = bch_btree_node_alloc(b->c, b->level); if (!IS_ERR_OR_NULL(n)) bch_btree_sort_into(b, n); @@ -1152,7 +1150,7 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k) * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it * our closure. 
*/ - struct btree *n = btree_node_alloc_replacement(b, NULL); + struct btree *n = btree_node_alloc_replacement(b); if (!IS_ERR_OR_NULL(n)) { swap(b, n); @@ -1359,7 +1357,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); if (b->level || stale > 10) - n = btree_node_alloc_replacement(b, NULL); + n = btree_node_alloc_replacement(b); if (!IS_ERR_OR_NULL(n)) swap(b, n); @@ -1882,10 +1880,7 @@ static int btree_split(struct btree *b, struct btree_op *op, struct btree *n1, *n2 = NULL, *n3 = NULL; uint64_t start_time = local_clock(); - if (b->level) - set_closure_blocking(&op->cl); - - n1 = btree_node_alloc_replacement(b, &op->cl); + n1 = btree_node_alloc_replacement(b); if (IS_ERR(n1)) goto err; @@ -1896,12 +1891,12 @@ static int btree_split(struct btree *b, struct btree_op *op, trace_bcache_btree_node_split(b, n1->sets[0].data->keys); - n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); + n2 = bch_btree_node_alloc(b->c, b->level); if (IS_ERR(n2)) goto err_free1; if (!b->parent) { - n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); + n3 = bch_btree_node_alloc(b->c, b->level + 1); if (IS_ERR(n3)) goto err_free2; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 72794ab..d691d95 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -355,7 +355,7 @@ static inline void rw_unlock(bool w, struct btree *b) _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ } \ rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c, &(op)->cl); \ + bch_cannibalize_unlock(c); \ if (_r == -ENOSPC) { \ wait_event((c)->try_wait, \ !(c)->try_harder); \ @@ -377,9 +377,9 @@ static inline bool should_split(struct btree *b) void bch_btree_node_read(struct btree *); void bch_btree_node_write(struct btree *, struct closure *); -void bch_cannibalize_unlock(struct cache_set *, struct closure *); +void bch_cannibalize_unlock(struct cache_set *); void bch_btree_set_root(struct btree *); -struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); +struct btree *bch_btree_node_alloc(struct cache_set *, int); struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index d85c700..26d18f4 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -350,14 +350,8 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, struct cache_set *c = s->op.c; struct open_bucket *b; BKEY_PADDED(key) alloc; - struct closure cl, *w = NULL; unsigned i; - if (s->writeback) { - closure_init_stack(&cl); - w = &cl; - } - /* * We might have to allocate a new bucket, which we can't do with a * spinlock held. 
So if we have to allocate, we drop the lock, allocate @@ -375,7 +369,8 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, spin_unlock(&c->data_bucket_lock); - if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) + if (bch_bucket_alloc_set(c, watermark, &alloc.key, + 1, s->writeback)) return false; spin_lock(&c->data_bucket_lock); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 9a164cd..84398a8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -427,7 +427,7 @@ static int __uuid_write(struct cache_set *c) lockdep_assert_held(&bch_register_lock); - if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl)) + if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) return 1; SET_KEY_SIZE(&k.key, c->sb.bucket_size); @@ -565,7 +565,7 @@ void bch_prio_write(struct cache *ca) p->magic = pset_magic(ca); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl); + bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -1439,6 +1439,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) closure_init_unlocked(&c->sb_write); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->try_wait); + init_waitqueue_head(&c->bucket_wait); closure_init_unlocked(&c->uuid_write); spin_lock_init(&c->sort_time_lock); mutex_init(&c->sort_lock); @@ -1608,7 +1609,7 @@ static void run_cache_set(struct cache_set *c) goto err_unlock_gc; err = "cannot allocate new btree root"; - c->root = bch_btree_node_alloc(c, 0, &op.cl); + c->root = bch_btree_node_alloc(c, 0); if (IS_ERR_OR_NULL(c->root)) goto err_unlock_gc; -- cgit v0.10.2 From 72a44517f3ca3725dc86081d105457df46448679 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2013 17:19:26 -0700 Subject: bcache: Convert gc to a kthread We needed a dedicated rescuer workqueue for gc anyways... and gc was conceptually a dedicated thread, just one that wasn't running all the time. Switch it to a dedicated thread to make the code a bit more straightforward. 
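The resulting shape is the usual kthread loop, sketched below with the names the patch introduces (freezing and error handling as in the hunks that follow):

    static int bch_gc_thread(void *arg)
    {
            struct cache_set *c = arg;

            while (1) {
                    bch_btree_gc(c);                /* one full gc pass */

                    set_current_state(TASK_INTERRUPTIBLE);
                    if (kthread_should_stop())
                            break;

                    try_to_freeze();
                    schedule();                     /* sleep until woken */
            }

            return 0;
    }

    /* callers that used to queue gc now simply wake the thread: */
    static inline void wake_up_gc(struct cache_set *c)
    {
            if (c->gc_thread)
                    wake_up_process(c->gc_thread);
    }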
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 1b64e66..b9bd586 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -210,7 +210,7 @@ static void invalidate_buckets_lru(struct cache *ca) * multiple times when it can't do anything */ ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } @@ -235,7 +235,7 @@ static void invalidate_buckets_fifo(struct cache *ca) if (++checked >= ca->sb.nbuckets) { ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } } @@ -260,7 +260,7 @@ static void invalidate_buckets_random(struct cache *ca) if (++checked >= ca->sb.nbuckets / 2) { ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d352074..09410eb 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -773,7 +773,7 @@ struct cache_set { struct gc_stat gc_stats; size_t nbuckets; - struct closure_with_waitlist gc; + struct task_struct *gc_thread; /* Where in the btree gc currently is */ struct bkey gc_done; @@ -786,11 +786,10 @@ struct cache_set { /* Counts how many sectors bio_insert has added to the cache */ atomic_t sectors_to_gc; - struct closure moving_gc; - struct closure_waitlist moving_gc_wait; + wait_queue_head_t moving_gc_wait; struct keybuf moving_gc_keys; /* Number of moving GC bios in flight */ - atomic_t in_flight; + struct semaphore moving_in_flight; struct btree *root; @@ -1176,7 +1175,7 @@ bool bch_cache_set_error(struct cache_set *, const char *, ...); void bch_prio_write(struct cache *); void bch_write_bdev_super(struct cached_dev *, struct closure *); -extern struct workqueue_struct *bcache_wq, *bch_gc_wq; +extern struct workqueue_struct *bcache_wq; extern const char * const bch_cache_modes[]; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 935d90d..17bfd87 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -28,7 +28,9 @@ #include #include +#include #include +#include #include #include #include @@ -105,7 +107,6 @@ static const char *op_type(struct btree_op *op) #define PTR_HASH(c, k) \ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) -struct workqueue_struct *bch_gc_wq; static struct workqueue_struct *btree_io_wq; void bch_btree_op_init_stack(struct btree_op *op) @@ -732,12 +733,9 @@ int bch_btree_cache_alloc(struct cache_set *c) { unsigned i; - /* XXX: doesn't check for errors */ - - closure_init_unlocked(&c->gc); - for (i = 0; i < mca_reserve(c); i++) - mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) + return -ENOMEM; list_splice_init(&c->btree_cache, &c->btree_cache_freeable); @@ -1456,9 +1454,8 @@ size_t bch_btree_gc_finish(struct cache_set *c) return available; } -static void bch_btree_gc(struct closure *cl) +static void bch_btree_gc(struct cache_set *c) { - struct cache_set *c = container_of(cl, struct cache_set, gc.cl); int ret; unsigned long available; struct gc_stat stats; @@ -1483,7 +1480,7 @@ static void bch_btree_gc(struct closure *cl) if (ret) { pr_warn("gc failed!"); - continue_at(cl, bch_btree_gc, bch_gc_wq); + return; } /* Possibly wait for new UUIDs or whatever to hit disk */ @@ -1505,12 +1502,35 @@ static void bch_btree_gc(struct closure *cl) trace_bcache_gc_end(c); - continue_at(cl, bch_moving_gc, bch_gc_wq); + 
bch_moving_gc(c); +} + +static int bch_gc_thread(void *arg) +{ + struct cache_set *c = arg; + + while (1) { + bch_btree_gc(c); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + + try_to_freeze(); + schedule(); + } + + return 0; } -void bch_queue_gc(struct cache_set *c) +int bch_gc_thread_start(struct cache_set *c) { - closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); + c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); + if (IS_ERR(c->gc_thread)) + return PTR_ERR(c->gc_thread); + + set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); + return 0; } /* Initial partial gc */ @@ -2480,14 +2500,12 @@ void bch_btree_exit(void) { if (btree_io_wq) destroy_workqueue(btree_io_wq); - if (bch_gc_wq) - destroy_workqueue(bch_gc_wq); } int __init bch_btree_init(void) { - if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || - !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) + btree_io_wq = create_singlethread_workqueue("bch_btree_io"); + if (!btree_io_wq) return -ENOMEM; return 0; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d691d95..fa9641a 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -388,12 +388,18 @@ int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *); int bch_btree_search_recurse(struct btree *, struct btree_op *); -void bch_queue_gc(struct cache_set *); +int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); -void bch_moving_gc(struct closure *); +void bch_moving_gc(struct cache_set *); int bch_btree_check(struct cache_set *, struct btree_op *); uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); +static inline void wake_up_gc(struct cache_set *c) +{ + if (c->gc_thread) + wake_up_process(c->gc_thread); +} + void bch_keybuf_init(struct keybuf *); void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, keybuf_pred_fn *); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 2c42377..6ba0504 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -57,8 +57,7 @@ static void write_moving_finish(struct closure *cl) bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); - atomic_dec_bug(&io->s.op.c->in_flight); - closure_wake_up(&io->s.op.c->moving_gc_wait); + up(&io->s.op.c->moving_in_flight); closure_return_with_destructor(cl, moving_io_destructor); } @@ -113,7 +112,7 @@ static void write_moving(struct closure *cl) bch_data_insert(&s->op.cl); } - continue_at(cl, write_moving_finish, bch_gc_wq); + continue_at(cl, write_moving_finish, system_wq); } static void read_moving_submit(struct closure *cl) @@ -124,15 +123,17 @@ static void read_moving_submit(struct closure *cl) bch_submit_bbio(bio, s->op.c, &io->w->key, 0); - continue_at(cl, write_moving, bch_gc_wq); + continue_at(cl, write_moving, system_wq); } -static void read_moving(struct closure *cl) +static void read_moving(struct cache_set *c) { - struct cache_set *c = container_of(cl, struct cache_set, moving_gc); struct keybuf_key *w; struct moving_io *io; struct bio *bio; + struct closure cl; + + closure_init_stack(&cl); /* XXX: if we error, background writeback could stall indefinitely */ @@ -164,13 +165,8 @@ static void read_moving(struct closure *cl) trace_bcache_gc_copy(&w->key); - closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); - - if (atomic_inc_return(&c->in_flight) >= 64) { - closure_wait_event(&c->moving_gc_wait, cl, - atomic_read(&c->in_flight) 
< 64); - continue_at(cl, read_moving, bch_gc_wq); - } + down(&c->moving_in_flight); + closure_call(&io->s.cl, read_moving_submit, NULL, &cl); } if (0) { @@ -180,7 +176,7 @@ err: if (!IS_ERR_OR_NULL(w->private)) bch_keybuf_del(&c->moving_gc_keys, w); } - closure_return(cl); + closure_sync(&cl); } static bool bucket_cmp(struct bucket *l, struct bucket *r) @@ -193,15 +189,14 @@ static unsigned bucket_heap_top(struct cache *ca) return GC_SECTORS_USED(heap_peek(&ca->heap)); } -void bch_moving_gc(struct closure *cl) +void bch_moving_gc(struct cache_set *c) { - struct cache_set *c = container_of(cl, struct cache_set, gc.cl); struct cache *ca; struct bucket *b; unsigned i; if (!c->copy_gc_enabled) - closure_return(cl); + return; mutex_lock(&c->bucket_lock); @@ -242,13 +237,11 @@ void bch_moving_gc(struct closure *cl) c->moving_gc_keys.last_scanned = ZERO_KEY; - closure_init(&c->moving_gc, cl); - read_moving(&c->moving_gc); - - closure_return(cl); + read_moving(c); } void bch_moving_init_cache_set(struct cache_set *c) { bch_keybuf_init(&c->moving_gc_keys); + sema_init(&c->moving_in_flight, 64); } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 26d18f4..f779eb4 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -520,7 +520,7 @@ static void bch_data_insert_start(struct closure *cl) if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { set_gc_sectors(op->c); - bch_queue_gc(op->c); + wake_up_gc(op->c); } /* diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 84398a8..f89e229 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1342,6 +1342,9 @@ static void cache_set_flush(struct closure *cl) kobject_put(&c->internal); kobject_del(&c->kobj); + if (c->gc_thread) + kthread_stop(c->gc_thread); + if (!IS_ERR_OR_NULL(c->root)) list_add(&c->root->list, &c->btree_cache); @@ -1579,8 +1582,6 @@ static void run_cache_set(struct cache_set *c) bch_journal_replay(c, &journal, &op); } else { pr_notice("invalidating existing data"); - /* Don't want invalidate_buckets() to queue a gc yet */ - closure_lock(&c->gc, NULL); for_each_cache(ca, c, i) { unsigned j; @@ -1606,12 +1607,12 @@ static void run_cache_set(struct cache_set *c) err = "cannot allocate new UUID bucket"; if (__uuid_write(c)) - goto err_unlock_gc; + goto err; err = "cannot allocate new btree root"; c->root = bch_btree_node_alloc(c, 0); if (IS_ERR_OR_NULL(c->root)) - goto err_unlock_gc; + goto err; bkey_copy_key(&c->root->key, &MAX_KEY); bch_btree_node_write(c->root, &op.cl); @@ -1628,12 +1629,12 @@ static void run_cache_set(struct cache_set *c) bch_journal_next(&c->journal); bch_journal_meta(c, &op.cl); - - /* Unlock */ - closure_set_stopped(&c->gc.cl); - closure_put(&c->gc.cl); } + err = "error starting gc thread"; + if (bch_gc_thread_start(c)) + goto err; + closure_sync(&op.cl); c->sb.last_mount = get_seconds(); bcache_write_super(c); @@ -1644,9 +1645,6 @@ static void run_cache_set(struct cache_set *c) flash_devs_run(c); return; -err_unlock_gc: - closure_set_stopped(&c->gc.cl); - closure_put(&c->gc.cl); err: closure_sync(&op.cl); /* XXX: test this, it's broken */ diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b3a66f1..ab286b9 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -566,7 +566,7 @@ STORE(__bch_cache_set) } if (attr == &sysfs_trigger_gc) - bch_queue_gc(c); + wake_up_gc(c); if (attr == &sysfs_prune_cache) { struct shrink_control sc; -- cgit v0.10.2 From 
5e6926daac267dd99552ae613f041a9e88bbf258 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:50:06 -0700 Subject: bcache: Convert writeback to a kthread This simplifies the writeback flow control quite a bit - previously, it was conceptually two coroutines, refill_dirty() and read_dirty(). This makes the code quite a bit more straightforward. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 09410eb..674e2f4 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -509,7 +509,7 @@ struct cached_dev { /* Limit number of writeback bios in flight */ struct semaphore in_flight; - struct closure_with_timer writeback; + struct task_struct *writeback_thread; struct keybuf writeback_keys; @@ -1038,7 +1038,11 @@ static inline void bkey_init(struct bkey *k) #define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) #define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) -#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) + +#define MAX_KEY_INODE (~(~0 << 20)) +#define MAX_KEY_OFFSET (((uint64_t) ~0) >> 1) +#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) + #define ZERO_KEY KEY(0, 0, 0) /* @@ -1214,8 +1218,6 @@ int bch_cache_allocator_init(struct cache *ca); void bch_debug_exit(void); int bch_debug_init(struct kobject *); -void bch_writeback_exit(void); -int bch_writeback_init(void); void bch_request_exit(void); int bch_request_init(void); void bch_btree_exit(void); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f89e229..b79dd5a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1029,6 +1029,7 @@ static void cached_dev_free(struct closure *cl) struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); cancel_delayed_work_sync(&dc->writeback_rate_update); + kthread_stop(dc->writeback_thread); mutex_lock(&bch_register_lock); @@ -2006,7 +2007,6 @@ static struct notifier_block reboot = { static void bcache_exit(void) { bch_debug_exit(); - bch_writeback_exit(); bch_request_exit(); bch_btree_exit(); if (bcache_kobj) @@ -2039,7 +2039,6 @@ static int __init bcache_init(void) sysfs_create_files(bcache_kobj, files) || bch_btree_init() || bch_request_init() || - bch_writeback_init() || bch_debug_init(bcache_kobj)) goto err; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 51dc709..4392f3f 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -11,18 +11,11 @@ #include "debug.h" #include "writeback.h" +#include +#include +#include #include -static struct workqueue_struct *dirty_wq; - -static void read_dirty(struct closure *); - -struct dirty_io { - struct closure cl; - struct cached_dev *dc; - struct bio bio; -}; - /* Rate limiting */ static void __update_writeback_rate(struct cached_dev *dc) @@ -72,9 +65,6 @@ out: dc->writeback_rate_derivative = derivative; dc->writeback_rate_change = change; dc->writeback_rate_target = target; - - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); } static void update_writeback_rate(struct work_struct *work) @@ -90,6 +80,9 @@ static void update_writeback_rate(struct work_struct *work) __update_writeback_rate(dc); up_read(&dc->writeback_lock); + + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); } static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) @@ -105,37 +98,11 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) return 
min_t(uint64_t, ret, HZ); } -/* Background writeback */ - -static bool dirty_pred(struct keybuf *buf, struct bkey *k) -{ - return KEY_DIRTY(k); -} - -static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) -{ - uint64_t stripe = KEY_START(k); - unsigned nr_sectors = KEY_SIZE(k); - struct cached_dev *dc = container_of(buf, struct cached_dev, - writeback_keys); - - if (!KEY_DIRTY(k)) - return false; - - do_div(stripe, dc->disk.stripe_size); - - while (1) { - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) == - dc->disk.stripe_size) - return true; - - if (nr_sectors <= dc->disk.stripe_size) - return false; - - nr_sectors -= dc->disk.stripe_size; - stripe++; - } -} +struct dirty_io { + struct closure cl; + struct cached_dev *dc; + struct bio bio; +}; static void dirty_init(struct keybuf_key *w) { @@ -153,132 +120,6 @@ static void dirty_init(struct keybuf_key *w) bch_bio_map(bio, NULL); } -static void refill_dirty(struct closure *cl) -{ - struct cached_dev *dc = container_of(cl, struct cached_dev, - writeback.cl); - struct keybuf *buf = &dc->writeback_keys; - bool searched_from_start = false; - struct bkey end = MAX_KEY; - SET_KEY_INODE(&end, dc->disk.id); - - if (!atomic_read(&dc->disk.detaching) && - !dc->writeback_running) - closure_return(cl); - - down_write(&dc->writeback_lock); - - if (!atomic_read(&dc->has_dirty)) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); - - up_write(&dc->writeback_lock); - closure_return(cl); - } - - if (bkey_cmp(&buf->last_scanned, &end) >= 0) { - buf->last_scanned = KEY(dc->disk.id, 0, 0); - searched_from_start = true; - } - - if (dc->partial_stripes_expensive) { - uint64_t i; - - for (i = 0; i < dc->disk.nr_stripes; i++) - if (atomic_read(dc->disk.stripe_sectors_dirty + i) == - dc->disk.stripe_size) - goto full_stripes; - - goto normal_refill; -full_stripes: - searched_from_start = false; /* not searching entire btree */ - bch_refill_keybuf(dc->disk.c, buf, &end, - dirty_full_stripe_pred); - } else { -normal_refill: - bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - } - - if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { - /* Searched the entire btree - delay awhile */ - - if (RB_EMPTY_ROOT(&buf->keys)) { - atomic_set(&dc->has_dirty, 0); - cached_dev_put(dc); - } - - if (!atomic_read(&dc->disk.detaching)) - closure_delay(&dc->writeback, dc->writeback_delay * HZ); - } - - up_write(&dc->writeback_lock); - - bch_ratelimit_reset(&dc->writeback_rate); - - /* Punt to workqueue only so we don't recurse and blow the stack */ - continue_at(cl, read_dirty, dirty_wq); -} - -void bch_writeback_queue(struct cached_dev *dc) -{ - if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { - if (!atomic_read(&dc->disk.detaching)) - closure_delay(&dc->writeback, dc->writeback_delay * HZ); - - continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); - } -} - -void bch_writeback_add(struct cached_dev *dc) -{ - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); - - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ - bch_write_bdev_super(dc, NULL); - } - - bch_writeback_queue(dc); - - if (dc->writeback_percent) - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } -} - -void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, - uint64_t offset, int nr_sectors) -{ - struct bcache_device *d = c->devices[inode]; - 
unsigned stripe_offset; - uint64_t stripe = offset; - - if (!d) - return; - - do_div(stripe, d->stripe_size); - - stripe_offset = offset & (d->stripe_size - 1); - - while (nr_sectors) { - int s = min_t(unsigned, abs(nr_sectors), - d->stripe_size - stripe_offset); - - if (nr_sectors < 0) - s = -s; - - atomic_add(s, d->stripe_sectors_dirty + stripe); - nr_sectors -= s; - stripe_offset = 0; - stripe++; - } -} - -/* Background writeback - IO loop */ - static void dirty_io_destructor(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); @@ -378,30 +219,33 @@ static void read_dirty_submit(struct closure *cl) continue_at(cl, write_dirty, system_wq); } -static void read_dirty(struct closure *cl) +static void read_dirty(struct cached_dev *dc) { - struct cached_dev *dc = container_of(cl, struct cached_dev, - writeback.cl); - unsigned delay = writeback_delay(dc, 0); + unsigned delay = 0; struct keybuf_key *w; struct dirty_io *io; + struct closure cl; + + closure_init_stack(&cl); /* * XXX: if we error, background writeback just spins. Should use some * mempools. */ - while (1) { + while (!kthread_should_stop()) { + try_to_freeze(); + w = bch_keybuf_next(&dc->writeback_keys); if (!w) break; BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); - if (delay > 0 && - (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50)) - delay = schedule_timeout_uninterruptible(delay); + if (KEY_START(&w->key) != dc->last_read || + jiffies_to_msecs(delay) > 50) + while (!kthread_should_stop() && delay) + delay = schedule_timeout_interruptible(delay); dc->last_read = KEY_OFFSET(&w->key); @@ -427,7 +271,7 @@ static void read_dirty(struct closure *cl) trace_bcache_writeback(&w->key); down(&dc->in_flight); - closure_call(&io->cl, read_dirty_submit, NULL, cl); + closure_call(&io->cl, read_dirty_submit, NULL, &cl); delay = writeback_delay(dc, KEY_SIZE(&w->key)); } @@ -443,7 +287,148 @@ err: * Wait for outstanding writeback IOs to finish (and keybuf slots to be * freed) before refilling again */ - continue_at(cl, refill_dirty, dirty_wq); + closure_sync(&cl); +} + +/* Scan for dirty data */ + +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, + uint64_t offset, int nr_sectors) +{ + struct bcache_device *d = c->devices[inode]; + unsigned stripe_offset; + uint64_t stripe = offset; + + if (!d) + return; + + do_div(stripe, d->stripe_size); + + stripe_offset = offset & (d->stripe_size - 1); + + while (nr_sectors) { + int s = min_t(unsigned, abs(nr_sectors), + d->stripe_size - stripe_offset); + + if (nr_sectors < 0) + s = -s; + + atomic_add(s, d->stripe_sectors_dirty + stripe); + nr_sectors -= s; + stripe_offset = 0; + stripe++; + } +} + +static bool dirty_pred(struct keybuf *buf, struct bkey *k) +{ + return KEY_DIRTY(k); +} + +static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) +{ + uint64_t stripe = KEY_START(k); + unsigned nr_sectors = KEY_SIZE(k); + struct cached_dev *dc = container_of(buf, struct cached_dev, + writeback_keys); + + if (!KEY_DIRTY(k)) + return false; + + do_div(stripe, dc->disk.stripe_size); + + while (1) { + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) == + dc->disk.stripe_size) + return true; + + if (nr_sectors <= dc->disk.stripe_size) + return false; + + nr_sectors -= dc->disk.stripe_size; + stripe++; + } +} + +static bool refill_dirty(struct cached_dev *dc) +{ + struct keybuf *buf = &dc->writeback_keys; + bool searched_from_start = false; + struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); + + if 
(bkey_cmp(&buf->last_scanned, &end) >= 0) { + buf->last_scanned = KEY(dc->disk.id, 0, 0); + searched_from_start = true; + } + + if (dc->partial_stripes_expensive) { + uint64_t i; + + for (i = 0; i < dc->disk.nr_stripes; i++) + if (atomic_read(dc->disk.stripe_sectors_dirty + i) == + dc->disk.stripe_size) + goto full_stripes; + + goto normal_refill; +full_stripes: + searched_from_start = false; /* not searching entire btree */ + bch_refill_keybuf(dc->disk.c, buf, &end, + dirty_full_stripe_pred); + } else { +normal_refill: + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); + } + + return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; +} + +static int bch_writeback_thread(void *arg) +{ + struct cached_dev *dc = arg; + bool searched_full_index; + + while (!kthread_should_stop()) { + down_write(&dc->writeback_lock); + if (!atomic_read(&dc->has_dirty) || + (!atomic_read(&dc->disk.detaching) && + !dc->writeback_running)) { + up_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + return 0; + + try_to_freeze(); + schedule(); + continue; + } + + searched_full_index = refill_dirty(dc); + + if (searched_full_index && + RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { + atomic_set(&dc->has_dirty, 0); + cached_dev_put(dc); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); + } + + up_write(&dc->writeback_lock); + + bch_ratelimit_reset(&dc->writeback_rate); + read_dirty(dc); + + if (searched_full_index) { + unsigned delay = dc->writeback_delay * HZ; + + while (delay && + !kthread_should_stop() && + !atomic_read(&dc->disk.detaching)) + delay = schedule_timeout_interruptible(delay); + } + } + + return 0; } /* Init */ @@ -483,12 +468,10 @@ void bch_sectors_dirty_init(struct cached_dev *dc) btree_root(sectors_dirty_init, dc->disk.c, &op, dc); } -void bch_cached_dev_writeback_init(struct cached_dev *dc) +int bch_cached_dev_writeback_init(struct cached_dev *dc) { sema_init(&dc->in_flight, 64); - closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); - bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; @@ -502,22 +485,16 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate_p_term_inverse = 64; dc->writeback_rate_d_smooth = 8; + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, + "bcache_writeback"); + if (IS_ERR(dc->writeback_thread)) + return PTR_ERR(dc->writeback_thread); + + set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE); + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); -} - -void bch_writeback_exit(void) -{ - if (dirty_wq) - destroy_workqueue(dirty_wq); -} - -int __init bch_writeback_init(void) -{ - dirty_wq = create_workqueue("bcache_writeback"); - if (!dirty_wq) - return -ENOMEM; return 0; } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 3496188..60516bf 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -56,11 +56,30 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, in_use <= CUTOFF_WRITEBACK; } +static inline void bch_writeback_queue(struct cached_dev *dc) +{ + wake_up_process(dc->writeback_thread); +} + +static inline void bch_writeback_add(struct cached_dev *dc) +{ + if (!atomic_read(&dc->has_dirty) && + !atomic_xchg(&dc->has_dirty, 1)) { + atomic_inc(&dc->count); + + if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { + 
SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); + /* XXX: should do this synchronously */ + bch_write_bdev_super(dc, NULL); + } + + bch_writeback_queue(dc); + } +} + void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); -void bch_writeback_queue(struct cached_dev *); -void bch_writeback_add(struct cached_dev *); void bch_sectors_dirty_init(struct cached_dev *dc); -void bch_cached_dev_writeback_init(struct cached_dev *); +int bch_cached_dev_writeback_init(struct cached_dev *); #endif -- cgit v0.10.2 From 48dad8baf92fe8967d9e1358af1cfdda1d2d3298 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 18:48:51 -0700 Subject: bcache: Add btree_map() functions Lots of stuff has been open coding its own btree traversal - which is generally pretty simple code, but there are a few subtleties. This adds new new functions, bch_btree_map_nodes() and bch_btree_map_keys(), which do the traversal for you. Everything that's open coding btree traversal now (with the exception of garbage collection) is slowly going to be converted to these two functions; being able to write other code at a higher level of abstraction is a big improvement w.r.t. overall code quality. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 674e2f4..20fe96c 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -384,8 +384,6 @@ struct keybuf_key { void *private; }; -typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); - struct keybuf { struct bkey last_scanned; spinlock_t lock; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index d0512e4..14c2a23 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -842,6 +842,13 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, /* Btree iterator */ +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. 
+ */ static inline bool btree_iter_cmp(struct btree_iter_set l, struct btree_iter_set r) { @@ -1146,16 +1153,16 @@ out: /* Sysfs stuff */ struct bset_stats { + struct btree_op op; size_t nodes; size_t sets_written, sets_unwritten; size_t bytes_written, bytes_unwritten; size_t floats, failed; }; -static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, - struct bset_stats *stats) +static int btree_bset_stats(struct btree_op *op, struct btree *b) { - struct bkey *k; + struct bset_stats *stats = container_of(op, struct bset_stats, op); unsigned i; stats->nodes++; @@ -1180,30 +1187,20 @@ static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, } } - if (b->level) { - struct btree_iter iter; - - for_each_key_filter(b, k, &iter, bch_ptr_bad) { - int ret = btree(bset_stats, k, b, op, stats); - if (ret) - return ret; - } - } - - return 0; + return MAP_CONTINUE; } int bch_bset_print_stats(struct cache_set *c, char *buf) { - struct btree_op op; struct bset_stats t; int ret; - bch_btree_op_init_stack(&op); memset(&t, 0, sizeof(struct bset_stats)); + bch_btree_op_init_stack(&t.op); + t.op.c = c; - ret = btree_root(bset_stats, c, &op, &t); - if (ret) + ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); + if (ret < 0) return ret; return snprintf(buf, PAGE_SIZE, diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 17bfd87..cfbdcf3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2296,6 +2296,82 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op) return ret; } +/* Map across nodes or keys */ + +static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, + btree_map_nodes_fn *fn, int flags) +{ + int ret = MAP_CONTINUE; + + if (b->level) { + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(b, &iter, from); + + while ((k = bch_btree_iter_next_filter(&iter, b, + bch_ptr_bad))) { + ret = btree(map_nodes_recurse, k, b, + op, from, fn, flags); + from = NULL; + + if (ret != MAP_CONTINUE) + return ret; + } + } + + if (!b->level || flags == MAP_ALL_NODES) + ret = fn(op, b); + + return ret; +} + +int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn, int flags) +{ + int ret = btree_root(map_nodes_recurse, c, op, from, fn, flags); + if (closure_blocking(&op->cl)) + closure_sync(&op->cl); + return ret; +} + +static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags) +{ + int ret = MAP_CONTINUE; + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(b, &iter, from); + + while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { + ret = !b->level + ? 
fn(op, b, k) + : btree(map_keys_recurse, k, b, op, from, fn, flags); + from = NULL; + + if (ret != MAP_CONTINUE) + return ret; + } + + if (!b->level && (flags & MAP_END_KEY)) + ret = fn(op, b, &KEY(KEY_INODE(&b->key), + KEY_OFFSET(&b->key), 0)); + + return ret; +} + +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags) +{ + int ret = btree_root(map_keys_recurse, c, op, from, fn, flags); + if (closure_blocking(&op->cl)) + closure_sync(&op->cl); + return ret; +} + /* Keybuf code */ static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) @@ -2314,74 +2390,70 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); } -static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, - struct keybuf *buf, struct bkey *end, - keybuf_pred_fn *pred) -{ - struct btree_iter iter; - bch_btree_iter_init(b, &iter, &buf->last_scanned); - - while (!array_freelist_empty(&buf->freelist)) { - struct bkey *k = bch_btree_iter_next_filter(&iter, b, - bch_ptr_bad); - - if (!b->level) { - if (!k) { - buf->last_scanned = b->key; - break; - } +struct refill { + struct btree_op op; + struct keybuf *buf; + struct bkey *end; + keybuf_pred_fn *pred; +}; - buf->last_scanned = *k; - if (bkey_cmp(&buf->last_scanned, end) >= 0) - break; +static int refill_keybuf_fn(struct btree_op *op, struct btree *b, + struct bkey *k) +{ + struct refill *refill = container_of(op, struct refill, op); + struct keybuf *buf = refill->buf; + int ret = MAP_CONTINUE; - if (pred(buf, k)) { - struct keybuf_key *w; + if (bkey_cmp(k, refill->end) >= 0) { + ret = MAP_DONE; + goto out; + } - spin_lock(&buf->lock); + if (!KEY_SIZE(k)) /* end key */ + goto out; - w = array_alloc(&buf->freelist); + if (refill->pred(buf, k)) { + struct keybuf_key *w; - w->private = NULL; - bkey_copy(&w->key, k); + spin_lock(&buf->lock); - if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) - array_free(&buf->freelist, w); + w = array_alloc(&buf->freelist); + if (!w) { + spin_unlock(&buf->lock); + return MAP_DONE; + } - spin_unlock(&buf->lock); - } - } else { - if (!k) - break; + w->private = NULL; + bkey_copy(&w->key, k); - btree(refill_keybuf, k, b, op, buf, end, pred); - /* - * Might get an error here, but can't really do anything - * and it'll get logged elsewhere. Just read what we - * can. - */ + if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) + array_free(&buf->freelist, w); - if (bkey_cmp(&buf->last_scanned, end) >= 0) - break; + if (array_freelist_empty(&buf->freelist)) + ret = MAP_DONE; - cond_resched(); - } + spin_unlock(&buf->lock); } - - return 0; +out: + buf->last_scanned = *k; + return ret; } void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, struct bkey *end, keybuf_pred_fn *pred) { struct bkey start = buf->last_scanned; - struct btree_op op; - bch_btree_op_init_stack(&op); + struct refill refill; cond_resched(); - btree_root(refill_keybuf, c, &op, buf, end, pred); - closure_sync(&op.cl); + bch_btree_op_init_stack(&refill.op); + refill.buf = buf; + refill.end = end; + refill.pred = pred; + + bch_btree_map_keys(&refill.op, c, &buf->last_scanned, + refill_keybuf_fn, MAP_END_KEY); pr_debug("found %s keys from %llu:%llu to %llu:%llu", RB_EMPTY_ROOT(&buf->keys) ? 
"no" : @@ -2465,9 +2537,9 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) } struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, - struct keybuf *buf, - struct bkey *end, - keybuf_pred_fn *pred) + struct keybuf *buf, + struct bkey *end, + keybuf_pred_fn *pred) { struct keybuf_key *ret; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index fa9641a..cafdeb0 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -400,9 +400,42 @@ static inline void wake_up_gc(struct cache_set *c) wake_up_process(c->gc_thread); } +#define MAP_DONE 0 +#define MAP_CONTINUE 1 + +#define MAP_ALL_NODES 0 +#define MAP_LEAF_NODES 1 + +#define MAP_END_KEY 1 + +typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *); +int __bch_btree_map_nodes(struct btree_op *, struct cache_set *, + struct bkey *, btree_map_nodes_fn *, int); + +static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES); +} + +static inline int bch_btree_map_leaf_nodes(struct btree_op *op, + struct cache_set *c, + struct bkey *from, + btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES); +} + +typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *, + struct bkey *); +int bch_btree_map_keys(struct btree_op *, struct cache_set *, + struct bkey *, btree_map_keys_fn *, int); + +typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); + void bch_keybuf_init(struct keybuf *); -void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, - keybuf_pred_fn *); +void bch_refill_keybuf(struct cache_set *, struct keybuf *, + struct bkey *, keybuf_pred_fn *); bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, struct bkey *); void bch_keybuf_del(struct keybuf *, struct keybuf_key *); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 4392f3f..c68de9f 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -433,31 +433,17 @@ static int bch_writeback_thread(void *arg) /* Init */ -static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, - struct cached_dev *dc) +static int sectors_dirty_init_fn(struct btree_op *op, struct btree *b, + struct bkey *k) { - struct bkey *k; - struct btree_iter iter; - - bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); - while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) - if (!b->level) { - if (KEY_INODE(k) > dc->disk.id) - break; - - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, dc->disk.id, - KEY_START(k), - KEY_SIZE(k)); - } else { - btree(sectors_dirty_init, k, b, op, dc); - if (KEY_INODE(k) > dc->disk.id) - break; - - cond_resched(); - } + if (KEY_INODE(k) > op->inode) + return MAP_DONE; - return 0; + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); + + return MAP_CONTINUE; } void bch_sectors_dirty_init(struct cached_dev *dc) @@ -465,7 +451,10 @@ void bch_sectors_dirty_init(struct cached_dev *dc) struct btree_op op; bch_btree_op_init_stack(&op); - btree_root(sectors_dirty_init, dc->disk.c, &op, dc); + op.inode = dc->disk.id; + + bch_btree_map_keys(&op, dc->disk.c, &KEY(op.inode, 0, 0), + sectors_dirty_init_fn, 0); } int bch_cached_dev_writeback_init(struct cached_dev *dc) -- cgit v0.10.2 From df8e89701fb02cba6e09c5f46f002778b5b52dd2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 
17:37:59 -0700 Subject: bcache: Move some stuff to btree.c With the new btree_map() functions, we don't need to export the stuff needed for traversing the btree anymore. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index cfbdcf3..bfd60e6 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -99,6 +99,13 @@ static const char *op_type(struct btree_op *op) return op_types[op->type]; } +enum { + BTREE_INSERT_STATUS_INSERT, + BTREE_INSERT_STATUS_BACK_MERGE, + BTREE_INSERT_STATUS_OVERWROTE, + BTREE_INSERT_STATUS_FRONT_MERGE, +}; + #define MAX_NEED_GC 64 #define MAX_SAVE_PRIO 72 @@ -116,6 +123,78 @@ void bch_btree_op_init_stack(struct btree_op *op) op->lock = -1; } +static inline bool should_split(struct btree *b) +{ + struct bset *i = write_block(b); + return b->written >= btree_blocks(b) || + (b->written + __set_blocks(i, i->keys + 15, b->c) + > btree_blocks(b)); +} + +#define insert_lock(s, b) ((b)->level <= (s)->lock) + +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ + if (!IS_ERR(_child)) { \ + _child->parent = (b); \ + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ + rw_unlock(_w, _child); \ + } else \ + _r = PTR_ERR(_child); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define btree_root(fn, c, op, ...) \ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) { \ + _b->parent = NULL; \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + } \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ + if (_r == -ENOSPC) { \ + wait_event((c)->try_wait, \ + !(c)->try_harder); \ + _r = -EINTR; \ + } \ + } while (_r == -EINTR); \ + \ + _r; \ +}) + /* Btree key manipulation */ void __bkey_put(struct cache_set *c, struct bkey *k) @@ -811,7 +890,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. 
*/ -void bch_cannibalize_unlock(struct cache_set *c) +static void bch_cannibalize_unlock(struct cache_set *c) { if (c->try_harder == current) { bch_time_stats_update(&c->try_harder_time, c->try_harder_start); @@ -2262,7 +2341,7 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, return 0; } -int bch_btree_search_recurse(struct btree *b, struct btree_op *op) +static int bch_btree_search_recurse(struct btree *b, struct btree_op *op) { struct search *s = container_of(op, struct search, op); struct bio *bio = &s->bio.bio; @@ -2296,6 +2375,18 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op) return ret; } +void bch_btree_search_async(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + + int ret = btree_root(search_recurse, op->c, op); + + if (ret == -EAGAIN) + continue_at(cl, bch_btree_search_async, bcache_wq); + + closure_return(cl); +} + /* Map across nodes or keys */ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index cafdeb0..1690f47 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -270,13 +270,6 @@ struct btree_op { BKEY_PADDED(replace); }; -enum { - BTREE_INSERT_STATUS_INSERT, - BTREE_INSERT_STATUS_BACK_MERGE, - BTREE_INSERT_STATUS_OVERWROTE, - BTREE_INSERT_STATUS_FRONT_MERGE, -}; - void bch_btree_op_init_stack(struct btree_op *); static inline void rw_lock(bool w, struct btree *b, int level) @@ -302,82 +295,9 @@ static inline void rw_unlock(bool w, struct btree *b) (w ? up_write : up_read)(&b->lock); } -#define insert_lock(s, b) ((b)->level <= (s)->lock) - -/* - * These macros are for recursing down the btree - they handle the details of - * locking and looking up nodes in the cache for you. They're best treated as - * mere syntax when reading code that uses them. - * - * op->lock determines whether we take a read or a write lock at a given depth. - * If you've got a read lock and find that you need a write lock (i.e. you're - * going to have to split), set op->lock and return -EINTR; btree_root() will - * call you again and you'll have the correct lock. - */ - -/** - * btree - recurse down the btree on a specified key - * @fn: function to call, which will be passed the child node - * @key: key to recurse on - * @b: parent btree node - * @op: pointer to struct btree_op - */ -#define btree(fn, key, b, op, ...) \ -({ \ - int _r, l = (b)->level - 1; \ - bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ - if (!IS_ERR(_child)) { \ - _child->parent = (b); \ - _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ - rw_unlock(_w, _child); \ - } else \ - _r = PTR_ERR(_child); \ - _r; \ -}) - -/** - * btree_root - call a function on the root of the btree - * @fn: function to call, which will be passed the child node - * @c: cache set - * @op: pointer to struct btree_op - */ -#define btree_root(fn, c, op, ...) 
\ -({ \ - int _r = -EINTR; \ - do { \ - struct btree *_b = (c)->root; \ - bool _w = insert_lock(op, _b); \ - rw_lock(_w, _b, _b->level); \ - if (_b == (c)->root && \ - _w == insert_lock(op, _b)) { \ - _b->parent = NULL; \ - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ - } \ - rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c); \ - if (_r == -ENOSPC) { \ - wait_event((c)->try_wait, \ - !(c)->try_harder); \ - _r = -EINTR; \ - } \ - } while (_r == -EINTR); \ - \ - _r; \ -}) - -static inline bool should_split(struct btree *b) -{ - struct bset *i = write_block(b); - return b->written >= btree_blocks(b) || - (b->written + __set_blocks(i, i->keys + 15, b->c) - > btree_blocks(b)); -} - void bch_btree_node_read(struct btree *); void bch_btree_node_write(struct btree *, struct closure *); -void bch_cannibalize_unlock(struct cache_set *); void bch_btree_set_root(struct btree *); struct btree *bch_btree_node_alloc(struct cache_set *, int); struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); @@ -386,7 +306,7 @@ int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *); -int bch_btree_search_recurse(struct btree *, struct btree_op *); +void bch_btree_search_async(struct closure *); int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f779eb4..a168725 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -754,18 +754,6 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) return s; } -static void btree_read_async(struct closure *cl) -{ - struct btree_op *op = container_of(cl, struct btree_op, cl); - - int ret = btree_root(search_recurse, op->c, op); - - if (ret == -EAGAIN) - continue_at(cl, btree_read_async, bcache_wq); - - closure_return(cl); -} - /* Cached devices */ static void cached_dev_bio_complete(struct closure *cl) @@ -1087,7 +1075,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; - closure_call(&s->op.cl, btree_read_async, NULL, cl); + closure_call(&s->op.cl, bch_btree_search_async, NULL, cl); continue_at(cl, cached_dev_read_done_bh, NULL); } @@ -1351,7 +1339,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) closure_call(&s->op.cl, bch_data_insert, NULL, cl); } else { - closure_call(&s->op.cl, btree_read_async, NULL, cl); + closure_call(&s->op.cl, bch_btree_search_async, NULL, cl); } continue_at(cl, search_free, NULL); -- cgit v0.10.2 From 2c1953e201a05ddfb1ea53f23d81a492c6513028 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:41:08 -0700 Subject: bcache: Convert bch_btree_read_async() to bch_btree_map_keys() This is a fairly straightforward conversion, mostly reshuffling - op->lookup_done goes away, replaced by MAP_DONE/MAP_CONTINUE. And the code for handling cache hits and misses wasn't really btree code, so it gets moved to request.c. 
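A minimal sketch of the shape this takes (the callback name here is illustrative; MAP_DONE/MAP_CONTINUE and bch_btree_map_keys() come from the earlier btree_map() patch in this series):

    static int cache_lookup_fn(struct btree_op *op, struct btree *b,
                               struct bkey *k)
    {
            struct search *s = container_of(op, struct search, op);

            if (KEY_INODE(k) > op->inode)
                    return MAP_DONE;        /* walked past this device: stop */

            /* cache hit/miss handling for this key lives in request.c */

            return MAP_CONTINUE;            /* walker hands us the next key */
    }

    /* and the async entry point becomes, roughly: */
    bch_btree_map_keys(op, op->c, &KEY(op->inode, bio->bi_sector, 0),
                       cache_lookup_fn, MAP_END_KEY);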
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index bfd60e6..3949673 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -23,7 +23,6 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "request.h" #include "writeback.h" #include @@ -2255,138 +2254,6 @@ void bch_btree_set_root(struct btree *b) closure_sync(&cl); } -/* Cache lookup */ - -static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, - struct bkey *k) -{ - struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; - int ret = 0; - - while (!ret && - !op->lookup_done) { - unsigned sectors = INT_MAX; - - if (KEY_INODE(k) == op->inode) { - if (KEY_START(k) <= bio->bi_sector) - break; - - sectors = min_t(uint64_t, sectors, - KEY_START(k) - bio->bi_sector); - } - - ret = s->d->cache_miss(b, s, bio, sectors); - } - - return ret; -} - -/* - * Read from a single key, handling the initial cache miss if the key starts in - * the middle of the bio - */ -static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, - struct bkey *k) -{ - struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; - unsigned ptr; - struct bio *n; - - int ret = submit_partial_cache_miss(b, op, k); - if (ret || op->lookup_done) - return ret; - - /* XXX: figure out best pointer - for multiple cache devices */ - ptr = 0; - - PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; - - while (!op->lookup_done && - KEY_INODE(k) == op->inode && - bio->bi_sector < KEY_OFFSET(k)) { - struct bkey *bio_key; - sector_t sector = PTR_OFFSET(k, ptr) + - (bio->bi_sector - KEY_START(k)); - unsigned sectors = min_t(uint64_t, INT_MAX, - KEY_OFFSET(k) - bio->bi_sector); - - n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (n == bio) - op->lookup_done = true; - - bio_key = &container_of(n, struct bbio, bio)->key; - - /* - * The bucket we're reading from might be reused while our bio - * is in flight, and we could then end up reading the wrong - * data. - * - * We guard against this by checking (in cache_read_endio()) if - * the pointer is stale again; if so, we treat it as an error - * and reread from the backing device (but we don't pass that - * error up anywhere). - */ - - bch_bkey_copy_single_ptr(bio_key, k, ptr); - SET_PTR_OFFSET(bio_key, 0, sector); - - n->bi_end_io = bch_cache_read_endio; - n->bi_private = &s->cl; - - __bch_submit_bbio(n, b->c); - } - - return 0; -} - -static int bch_btree_search_recurse(struct btree *b, struct btree_op *op) -{ - struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; - - int ret = 0; - struct bkey *k; - struct btree_iter iter; - bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); - - do { - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); - if (!k) { - /* - * b->key would be exactly what we want, except that - * pointers to btree nodes have nonzero size - we - * wouldn't go far enough - */ - - ret = submit_partial_cache_miss(b, op, - &KEY(KEY_INODE(&b->key), - KEY_OFFSET(&b->key), 0)); - break; - } - - ret = b->level - ? 
btree(search_recurse, k, b, op) - : submit_partial_cache_hit(b, op, k); - } while (!ret && - !op->lookup_done); - - return ret; -} - -void bch_btree_search_async(struct closure *cl) -{ - struct btree_op *op = container_of(cl, struct btree_op, cl); - - int ret = btree_root(search_recurse, op->c, op); - - if (ret == -EAGAIN) - continue_at(cl, bch_btree_search_async, bcache_wq); - - closure_return(cl); -} - /* Map across nodes or keys */ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 1690f47..60dadd7 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -264,7 +264,6 @@ struct btree_op { unsigned flush_journal:1; unsigned insert_data_done:1; - unsigned lookup_done:1; unsigned insert_collision:1; BKEY_PADDED(replace); @@ -306,8 +305,6 @@ int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *); -void bch_btree_search_async(struct closure *); - int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); void bch_moving_gc(struct cache_set *); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8866f8e..6f4daf0 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -7,7 +7,6 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "request.h" #include diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index a168725..854743e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -638,24 +638,9 @@ void bch_data_insert(struct closure *cl) bch_data_insert_start(cl); } -/* Common code for the make_request functions */ +/* Cache lookup */ -static void request_endio(struct bio *bio, int error) -{ - struct closure *cl = bio->bi_private; - - if (error) { - struct search *s = container_of(cl, struct search, cl); - s->error = error; - /* Only cache read errors are recoverable */ - s->recoverable = false; - } - - bio_put(bio); - closure_put(cl); -} - -void bch_cache_read_endio(struct bio *bio, int error) +static void bch_cache_read_endio(struct bio *bio, int error) { struct bbio *b = container_of(bio, struct bbio, bio); struct closure *cl = bio->bi_private; @@ -678,6 +663,120 @@ void bch_cache_read_endio(struct bio *bio, int error) bch_bbio_endio(s->op.c, bio, error, "reading from cache"); } +static int submit_partial_cache_miss(struct btree *b, struct search *s, + struct bkey *k) +{ + struct bio *bio = &s->bio.bio; + int ret = MAP_CONTINUE; + + do { + unsigned sectors = INT_MAX; + + if (KEY_INODE(k) == s->op.inode) { + if (KEY_START(k) <= bio->bi_sector) + break; + + sectors = min_t(uint64_t, sectors, + KEY_START(k) - bio->bi_sector); + } + + ret = s->d->cache_miss(b, s, bio, sectors); + } while (ret == MAP_CONTINUE); + + return ret; +} + +/* + * Read from a single key, handling the initial cache miss if the key starts in + * the middle of the bio + */ +static int submit_partial_cache_hit(struct btree_op *op, struct btree *b, + struct bkey *k) +{ + struct search *s = container_of(op, struct search, op); + struct bio *bio = &s->bio.bio; + unsigned ptr; + struct bio *n; + + int ret = submit_partial_cache_miss(b, s, k); + if (ret != MAP_CONTINUE || !KEY_SIZE(k)) + return ret; + + /* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0; + + PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; + + while (ret == MAP_CONTINUE && + KEY_INODE(k) 
== op->inode && + bio->bi_sector < KEY_OFFSET(k)) { + struct bkey *bio_key; + sector_t sector = PTR_OFFSET(k, ptr) + + (bio->bi_sector - KEY_START(k)); + unsigned sectors = min_t(uint64_t, INT_MAX, + KEY_OFFSET(k) - bio->bi_sector); + + n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + if (n == bio) + ret = MAP_DONE; + + bio_key = &container_of(n, struct bbio, bio)->key; + + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */ + + bch_bkey_copy_single_ptr(bio_key, k, ptr); + SET_PTR_OFFSET(bio_key, 0, sector); + + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl; + + __bch_submit_bbio(n, b->c); + } + + return ret; +} + +static void cache_lookup(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); + struct bio *bio = &s->bio.bio; + + int ret = bch_btree_map_keys(op, op->c, + &KEY(op->inode, bio->bi_sector, 0), + submit_partial_cache_hit, 1); + if (ret == -EAGAIN) + continue_at(cl, cache_lookup, bcache_wq); + + closure_return(cl); +} + +/* Common code for the make_request functions */ + +static void request_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + + if (error) { + struct search *s = container_of(cl, struct search, cl); + s->error = error; + /* Only cache read errors are recoverable */ + s->recoverable = false; + } + + bio_put(bio); + closure_put(cl); +} + static void bio_complete(struct search *s) { if (s->orig_bio) { @@ -1005,15 +1104,14 @@ static void cached_dev_read_done_bh(struct closure *cl) static int cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { - int ret = 0; + int ret = MAP_CONTINUE; unsigned reada = 0; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; if (s->cache_miss || s->op.bypass) { miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (miss == bio) - s->op.lookup_done = true; + ret = miss == bio ? MAP_DONE : MAP_CONTINUE; goto out_submit; } @@ -1033,11 +1131,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, return ret; miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (miss == bio) - s->op.lookup_done = true; - else - /* btree_search_recurse()'s btree iterator is no good anymore */ - ret = -EINTR; + + /* btree_search_recurse()'s btree iterator is no good anymore */ + ret = miss == bio ? 
MAP_DONE : -EINTR; cache_bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), @@ -1075,7 +1171,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; - closure_call(&s->op.cl, bch_btree_search_async, NULL, cl); + closure_call(&s->op.cl, cache_lookup, NULL, cl); continue_at(cl, cached_dev_read_done_bh, NULL); } @@ -1287,9 +1383,9 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s, bio_advance(bio, min(sectors << 9, bio->bi_size)); if (!bio->bi_size) - s->op.lookup_done = true; + return MAP_DONE; - return 0; + return MAP_CONTINUE; } static void flash_dev_nodata(struct closure *cl) @@ -1339,7 +1435,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) closure_call(&s->op.cl, bch_data_insert, NULL, cl); } else { - closure_call(&s->op.cl, bch_btree_search_async, NULL, cl); + closure_call(&s->op.cl, cache_lookup, NULL, cl); } continue_at(cl, search_free, NULL); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 7d02ac5..b0b4b0b 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -31,10 +31,8 @@ struct search { struct keylist insert_keys; }; -void bch_cache_read_endio(struct bio *, int); unsigned bch_get_congested(struct cache_set *); void bch_data_insert(struct closure *cl); -void bch_cache_read_endio(struct bio *, int); void bch_open_buckets_free(struct cache_set *); int bch_open_buckets_alloc(struct cache_set *); -- cgit v0.10.2 From cc23196631fbcd1bc3eafedbb712413fdbf946a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:41:13 -0700 Subject: bcache: Clean up cache_lookup_fn There was some looping in submit_partial_cache_hit() and submit_partial_cache_miss() that isn't needed anymore - originally, we wouldn't necessarily process the full hit or miss all at once because when splitting the bio, we took into account the restrictions of the device we were sending it to. But, device bio size restrictions are now handled elsewhere, with a wrapper around generic_make_request() - so that looping has been unnecessary for a while now and we can now do quite a bit of cleanup. And if we trim the key we're reading from to match the subset we're actually reading, we don't have to explicitly calculate bi_sector anymore. Neat. 
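To make that last point concrete, here is a small self-contained sketch (plain userspace C, not kernel code) of why trimming the copied key removes the explicit sector arithmetic: once the extent is cut down to exactly the fragment being read, its pointer already names the right cache sector. cut_front()/cut_back() below are simplified stand-ins for bch_cut_front()/bch_cut_back(), and the toy_key layout (end offset, size, data pointer) is only an approximation of a bcache extent key.

#include <assert.h>
#include <stdio.h>

struct toy_key {
	unsigned long long offset;	/* end of the extent, in sectors */
	unsigned long long size;	/* length, in sectors */
	unsigned long long ptr;		/* cache sector of the first data sector */
};

/* Drop everything before 'where': the start moves forward, ptr follows. */
static void cut_front(unsigned long long where, struct toy_key *k)
{
	unsigned long long start = k->offset - k->size;

	assert(where >= start && where <= k->offset);
	k->ptr += where - start;
	k->size = k->offset - where;
}

/* Drop everything after 'where': the end moves back, ptr is unchanged. */
static void cut_back(unsigned long long where, struct toy_key *k)
{
	unsigned long long start = k->offset - k->size;

	assert(where >= start && where <= k->offset);
	k->offset = where;
	k->size = where - start;
}

int main(void)
{
	/* Extent covering sectors [100, 160), cached at sector 5000. */
	struct toy_key k = { .offset = 160, .size = 60, .ptr = 5000 };

	/* The bio fragment wants sectors [120, 140). */
	cut_front(120, &k);
	cut_back(140, &k);

	/* ptr now points straight at the data for sector 120. */
	printf("read %llu sectors from cache sector %llu\n", k.size, k.ptr);
	assert(k.ptr == 5020 && k.size == 20);
	return 0;
}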
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 854743e..de3fc76 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -663,86 +663,70 @@ static void bch_cache_read_endio(struct bio *bio, int error) bch_bbio_endio(s->op.c, bio, error, "reading from cache"); } -static int submit_partial_cache_miss(struct btree *b, struct search *s, - struct bkey *k) -{ - struct bio *bio = &s->bio.bio; - int ret = MAP_CONTINUE; - - do { - unsigned sectors = INT_MAX; - - if (KEY_INODE(k) == s->op.inode) { - if (KEY_START(k) <= bio->bi_sector) - break; - - sectors = min_t(uint64_t, sectors, - KEY_START(k) - bio->bi_sector); - } - - ret = s->d->cache_miss(b, s, bio, sectors); - } while (ret == MAP_CONTINUE); - - return ret; -} - /* * Read from a single key, handling the initial cache miss if the key starts in * the middle of the bio */ -static int submit_partial_cache_hit(struct btree_op *op, struct btree *b, - struct bkey *k) +static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) { struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; + struct bio *n, *bio = &s->bio.bio; + struct bkey *bio_key; unsigned ptr; - struct bio *n; - int ret = submit_partial_cache_miss(b, s, k); - if (ret != MAP_CONTINUE || !KEY_SIZE(k)) - return ret; + if (bkey_cmp(k, &KEY(op->inode, bio->bi_sector, 0)) <= 0) + return MAP_CONTINUE; + + if (KEY_INODE(k) != s->op.inode || + KEY_START(k) > bio->bi_sector) { + unsigned bio_sectors = bio_sectors(bio); + unsigned sectors = KEY_INODE(k) == s->op.inode + ? min_t(uint64_t, INT_MAX, + KEY_START(k) - bio->bi_sector) + : INT_MAX; + + int ret = s->d->cache_miss(b, s, bio, sectors); + if (ret != MAP_CONTINUE) + return ret; + + /* if this was a complete miss we shouldn't get here */ + BUG_ON(bio_sectors <= sectors); + } + + if (!KEY_SIZE(k)) + return MAP_CONTINUE; /* XXX: figure out best pointer - for multiple cache devices */ ptr = 0; PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; - while (ret == MAP_CONTINUE && - KEY_INODE(k) == op->inode && - bio->bi_sector < KEY_OFFSET(k)) { - struct bkey *bio_key; - sector_t sector = PTR_OFFSET(k, ptr) + - (bio->bi_sector - KEY_START(k)); - unsigned sectors = min_t(uint64_t, INT_MAX, - KEY_OFFSET(k) - bio->bi_sector); - - n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (n == bio) - ret = MAP_DONE; - - bio_key = &container_of(n, struct bbio, bio)->key; + n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, + KEY_OFFSET(k) - bio->bi_sector), + GFP_NOIO, s->d->bio_split); - /* - * The bucket we're reading from might be reused while our bio - * is in flight, and we could then end up reading the wrong - * data. - * - * We guard against this by checking (in cache_read_endio()) if - * the pointer is stale again; if so, we treat it as an error - * and reread from the backing device (but we don't pass that - * error up anywhere). 
- */ + bio_key = &container_of(n, struct bbio, bio)->key; + bch_bkey_copy_single_ptr(bio_key, k, ptr); - bch_bkey_copy_single_ptr(bio_key, k, ptr); - SET_PTR_OFFSET(bio_key, 0, sector); + bch_cut_front(&KEY(s->op.inode, n->bi_sector, 0), bio_key); + bch_cut_back(&KEY(s->op.inode, bio_end_sector(n), 0), bio_key); - n->bi_end_io = bch_cache_read_endio; - n->bi_private = &s->cl; + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl; - __bch_submit_bbio(n, b->c); - } + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */ - return ret; + __bch_submit_bbio(n, b->c); + return n == bio ? MAP_DONE : MAP_CONTINUE; } static void cache_lookup(struct closure *cl) @@ -753,7 +737,7 @@ static void cache_lookup(struct closure *cl) int ret = bch_btree_map_keys(op, op->c, &KEY(op->inode, bio->bi_sector, 0), - submit_partial_cache_hit, 1); + cache_lookup_fn, MAP_END_KEY); if (ret == -EAGAIN) continue_at(cl, cache_lookup, bcache_wq); -- cgit v0.10.2 From c18536a72ddd7fe30d63e6c1500b5c930ac14594 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:44:17 -0700 Subject: bcache: Prune struct btree_op Eventual goal is for struct btree_op to contain only what is necessary for traversing the btree. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 14c2a23..fae5b7b 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1197,7 +1197,6 @@ int bch_bset_print_stats(struct cache_set *c, char *buf) memset(&t, 0, sizeof(struct bset_stats)); bch_btree_op_init_stack(&t.op); - t.op.c = c; ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); if (ret < 0) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 3949673..5cb59c3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -503,7 +503,7 @@ static void btree_node_write_work(struct work_struct *w) rw_unlock(true, b); } -static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) +static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) { struct bset *i = b->sets[b->nsets].data; struct btree_write *w = btree_current_write(b); @@ -516,15 +516,15 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) set_btree_node_dirty(b); - if (op->journal) { + if (journal_ref) { if (w->journal && - journal_pin_cmp(b->c, w, op)) { + journal_pin_cmp(b->c, w->journal, journal_ref)) { atomic_dec_bug(w->journal); w->journal = NULL; } if (!w->journal) { - w->journal = op->journal; + w->journal = journal_ref; atomic_inc(w->journal); } } @@ -1663,13 +1663,16 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, return 0; } -int bch_btree_check(struct cache_set *c, struct btree_op *op) +int bch_btree_check(struct cache_set *c) { int ret = -ENOMEM; unsigned i; unsigned long *seen[MAX_CACHES_PER_SET]; + struct btree_op op; memset(seen, 0, sizeof(seen)); + bch_btree_op_init_stack(&op); + op.lock = SHRT_MAX; for (i = 0; c->cache[i]; i++) { size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); @@ -1681,7 +1684,7 @@ int bch_btree_check(struct cache_set *c, struct btree_op *op) memset(seen[i], 0xFF, n); } - ret = btree_root(check_recurse, c, op, seen); + ret = btree_root(check_recurse, c, 
&op, seen); err: for (i = 0; i < MAX_CACHES_PER_SET; i++) kfree(seen[i]); @@ -2091,7 +2094,8 @@ err: } static int bch_btree_insert_node(struct btree *b, struct btree_op *op, - struct keylist *insert_keys) + struct keylist *insert_keys, + atomic_t *journal_ref) { int ret = 0; struct keylist split_keys; @@ -2123,7 +2127,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, if (bch_btree_insert_keys(b, op, insert_keys)) { if (!b->level) - bch_btree_leaf_dirty(b, op); + bch_btree_leaf_dirty(b, journal_ref); else bch_btree_node_write(b, &op->cl); } @@ -2162,7 +2166,7 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, BUG_ON(op->type != BTREE_INSERT); - ret = bch_btree_insert_node(b, op, &insert); + ret = bch_btree_insert_node(b, op, &insert, NULL); BUG_ON(!ret && !bch_keylist_empty(&insert)); out: @@ -2172,7 +2176,7 @@ out: } static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, - struct keylist *keys) + struct keylist *keys, atomic_t *journal_ref) { if (bch_keylist_empty(keys)) return 0; @@ -2189,14 +2193,14 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, return -EIO; } - return btree(insert_recurse, k, b, op, keys); + return btree(insert_recurse, k, b, op, keys, journal_ref); } else { - return bch_btree_insert_node(b, op, keys); + return bch_btree_insert_node(b, op, keys, journal_ref); } } int bch_btree_insert(struct btree_op *op, struct cache_set *c, - struct keylist *keys) + struct keylist *keys, atomic_t *journal_ref) { int ret = 0; @@ -2210,7 +2214,7 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, while (!bch_keylist_empty(keys)) { op->lock = 0; - ret = btree_root(insert_recurse, c, op, keys); + ret = btree_root(insert_recurse, c, op, keys, journal_ref); if (ret == -EAGAIN) { ret = 0; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 60dadd7..3f820b6 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -238,17 +238,6 @@ void __bkey_put(struct cache_set *c, struct bkey *k); struct btree_op { struct closure cl; - struct cache_set *c; - - /* Journal entry we have a refcount on */ - atomic_t *journal; - - /* Bio to be inserted into the cache */ - struct bio *cache_bio; - - unsigned inode; - - uint16_t write_prio; /* Btree level at which we start taking write locks */ short lock; @@ -259,11 +248,6 @@ struct btree_op { BTREE_REPLACE } type:8; - unsigned csum:1; - unsigned bypass:1; - unsigned flush_journal:1; - - unsigned insert_data_done:1; unsigned insert_collision:1; BKEY_PADDED(replace); @@ -303,12 +287,13 @@ struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); -int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *); +int bch_btree_insert(struct btree_op *, struct cache_set *, + struct keylist *, atomic_t *); int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); void bch_moving_gc(struct cache_set *); -int bch_btree_check(struct cache_set *, struct btree_op *); +int bch_btree_check(struct cache_set *); uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); static inline void wake_up_gc(struct cache_set *c) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 6f4daf0..725c8eb 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -30,17 +30,20 @@ static void journal_read_endio(struct bio *bio, int error) } 
static int journal_read_bucket(struct cache *ca, struct list_head *list, - struct btree_op *op, unsigned bucket_index) + unsigned bucket_index) { struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio; struct journal_replay *i; struct jset *j, *data = ca->set->journal.w[0].data; + struct closure cl; unsigned len, left, offset = 0; int ret = 0; sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + closure_init_stack(&cl); + pr_debug("reading %llu", (uint64_t) bucket); while (offset < ca->sb.bucket_size) { @@ -54,11 +57,11 @@ reread: left = ca->sb.bucket_size - offset; bio->bi_size = len << 9; bio->bi_end_io = journal_read_endio; - bio->bi_private = &op->cl; + bio->bi_private = &cl; bch_bio_map(bio, data); - closure_bio_submit(bio, &op->cl, ca); - closure_sync(&op->cl); + closure_bio_submit(bio, &cl, ca); + closure_sync(&cl); /* This function could be simpler now since we no longer write * journal entries that overlap bucket boundaries; this means @@ -128,12 +131,11 @@ next_set: return ret; } -int bch_journal_read(struct cache_set *c, struct list_head *list, - struct btree_op *op) +int bch_journal_read(struct cache_set *c, struct list_head *list) { #define read_bucket(b) \ ({ \ - int ret = journal_read_bucket(ca, list, op, b); \ + int ret = journal_read_bucket(ca, list, b); \ __set_bit(b, bitmap); \ if (ret < 0) \ return ret; \ @@ -291,8 +293,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } } -int bch_journal_replay(struct cache_set *s, struct list_head *list, - struct btree_op *op) +int bch_journal_replay(struct cache_set *s, struct list_head *list) { int ret = 0, keys = 0, entries = 0; struct bkey *k; @@ -301,8 +302,11 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, uint64_t start = i->j.last_seq, end = i->j.seq, n = start; struct keylist keylist; + struct btree_op op; bch_keylist_init(&keylist); + bch_btree_op_init_stack(&op); + op.lock = SHRT_MAX; list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); @@ -319,9 +323,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, bkey_copy(keylist.top, k); bch_keylist_push(&keylist); - op->journal = i->pin; - - ret = bch_btree_insert(op, s, &keylist); + ret = bch_btree_insert(&op, s, &keylist, i->pin); if (ret) goto err; @@ -346,7 +348,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, kfree(i); } err: - closure_sync(&op->cl); + closure_sync(&op.cl); return ret; } @@ -368,8 +370,8 @@ retry: if (!best) best = b; else if (journal_pin_cmp(c, - btree_current_write(best), - btree_current_write(b))) { + btree_current_write(best)->journal, + btree_current_write(b)->journal)) { best = b; } } diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 7045e6f..5e9edb9 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -189,8 +189,7 @@ struct journal_device { }; #define journal_pin_cmp(c, l, r) \ - (fifo_idx(&(c)->journal.pin, (l)->journal) > \ - fifo_idx(&(c)->journal.pin, (r)->journal)) + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) #define JOURNAL_PIN 20000 @@ -206,10 +205,8 @@ atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); void bch_journal_next(struct journal *); void bch_journal_mark(struct cache_set *, struct list_head *); void bch_journal_meta(struct cache_set *, struct closure *); -int bch_journal_read(struct cache_set *, struct list_head *, - struct btree_op *); -int bch_journal_replay(struct cache_set *, 
struct list_head *, - struct btree_op *); +int bch_journal_read(struct cache_set *, struct list_head *); +int bch_journal_replay(struct cache_set *, struct list_head *); void bch_journal_free(struct cache_set *); int bch_journal_alloc(struct cache_set *); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 6ba0504..80e30d7 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -55,9 +55,9 @@ static void write_moving_finish(struct closure *cl) if (io->s.op.insert_collision) trace_bcache_gc_copy_collision(&io->w->key); - bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); + bch_keybuf_del(&io->s.c->moving_gc_keys, io->w); - up(&io->s.op.c->moving_in_flight); + up(&io->s.c->moving_in_flight); closure_return_with_destructor(cl, moving_io_destructor); } @@ -70,7 +70,7 @@ static void read_moving_endio(struct bio *bio, int error) if (error) io->s.error = error; - bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); + bch_bbio_endio(io->s.c, bio, error, "reading data to move"); } static void moving_init(struct moving_io *io) @@ -99,11 +99,11 @@ static void write_moving(struct closure *cl) io->bio.bio.bi_sector = KEY_START(&io->w->key); s->op.lock = -1; - s->op.write_prio = 1; - s->op.cache_bio = &io->bio.bio; + s->write_prio = 1; + s->cache_bio = &io->bio.bio; s->writeback = KEY_DIRTY(&io->w->key); - s->op.csum = KEY_CSUM(&io->w->key); + s->csum = KEY_CSUM(&io->w->key); s->op.type = BTREE_REPLACE; bkey_copy(&s->op.replace, &io->w->key); @@ -121,7 +121,7 @@ static void read_moving_submit(struct closure *cl) struct moving_io *io = container_of(s, struct moving_io, s); struct bio *bio = &io->bio.bio; - bch_submit_bbio(bio, s->op.c, &io->w->key, 0); + bch_submit_bbio(bio, s->c, &io->w->key, 0); continue_at(cl, write_moving, system_wq); } @@ -151,8 +151,8 @@ static void read_moving(struct cache_set *c) w->private = io; io->w = w; - io->s.op.inode = KEY_INODE(&w->key); - io->s.op.c = c; + io->s.inode = KEY_INODE(&w->key); + io->s.c = c; moving_init(io); bio = &io->bio.bio; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index de3fc76..818e2e3 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -217,6 +217,7 @@ static void bch_data_insert_keys(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); struct search *s = container_of(op, struct search, op); + atomic_t *journal_ref = NULL; /* * If we're looping, might already be waiting on @@ -231,20 +232,19 @@ static void bch_data_insert_keys(struct closure *cl) #endif if (s->write) - op->journal = bch_journal(op->c, &s->insert_keys, - op->flush_journal + journal_ref = bch_journal(s->c, &s->insert_keys, + s->flush_journal ? 
&s->cl : NULL); - if (bch_btree_insert(op, op->c, &s->insert_keys)) { + if (bch_btree_insert(op, s->c, &s->insert_keys, journal_ref)) { s->error = -ENOMEM; - op->insert_data_done = true; + s->insert_data_done = true; } - if (op->journal) - atomic_dec_bug(op->journal); - op->journal = NULL; + if (journal_ref) + atomic_dec_bug(journal_ref); - if (!op->insert_data_done) + if (!s->insert_data_done) continue_at(cl, bch_data_insert_start, bcache_wq); bch_keylist_free(&s->insert_keys); @@ -347,7 +347,7 @@ found: static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, struct search *s) { - struct cache_set *c = s->op.c; + struct cache_set *c = s->c; struct open_bucket *b; BKEY_PADDED(key) alloc; unsigned i; @@ -363,7 +363,7 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, spin_lock(&c->data_bucket_lock); while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { - unsigned watermark = s->op.write_prio + unsigned watermark = s->write_prio ? WATERMARK_MOVINGGC : WATERMARK_NONE; @@ -435,7 +435,7 @@ static void bch_data_invalidate(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); struct search *s = container_of(op, struct search, op); - struct bio *bio = op->cache_bio; + struct bio *bio = s->cache_bio; pr_debug("invalidating %i sectors from %llu", bio_sectors(bio), (uint64_t) bio->bi_sector); @@ -443,17 +443,17 @@ static void bch_data_invalidate(struct closure *cl) while (bio_sectors(bio)) { unsigned len = min(bio_sectors(bio), 1U << 14); - if (bch_keylist_realloc(&s->insert_keys, 0, op->c)) + if (bch_keylist_realloc(&s->insert_keys, 0, s->c)) goto out; bio->bi_sector += len; bio->bi_size -= len << 9; bch_keylist_add(&s->insert_keys, - &KEY(op->inode, bio->bi_sector, len)); + &KEY(s->inode, bio->bi_sector, len)); } - op->insert_data_done = true; + s->insert_data_done = true; bio_put(bio); out: continue_at(cl, bch_data_insert_keys, bcache_wq); @@ -506,21 +506,21 @@ static void bch_data_insert_endio(struct bio *bio, int error) set_closure_fn(cl, NULL, NULL); } - bch_bbio_endio(op->c, bio, error, "writing data to cache"); + bch_bbio_endio(s->c, bio, error, "writing data to cache"); } static void bch_data_insert_start(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); struct search *s = container_of(op, struct search, op); - struct bio *bio = op->cache_bio, *n; + struct bio *bio = s->cache_bio, *n; - if (op->bypass) + if (s->bypass) return bch_data_invalidate(cl); - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { - set_gc_sectors(op->c); - wake_up_gc(op->c); + if (atomic_sub_return(bio_sectors(bio), &s->c->sectors_to_gc) < 0) { + set_gc_sectors(s->c); + wake_up_gc(s->c); } /* @@ -533,17 +533,17 @@ static void bch_data_insert_start(struct closure *cl) unsigned i; struct bkey *k; struct bio_set *split = s->d - ? s->d->bio_split : op->c->bio_split; + ? s->d->bio_split : s->c->bio_split; /* 1 for the device pointer and 1 for the chksum */ if (bch_keylist_realloc(&s->insert_keys, - 1 + (op->csum ? 1 : 0), - op->c)) + 1 + (s->csum ? 
1 : 0), + s->c)) continue_at(cl, bch_data_insert_keys, bcache_wq); k = s->insert_keys.top; bkey_init(k); - SET_KEY_INODE(k, op->inode); + SET_KEY_INODE(k, s->inode); SET_KEY_OFFSET(k, bio->bi_sector); if (!bch_alloc_sectors(k, bio_sectors(bio), s)) @@ -558,11 +558,11 @@ static void bch_data_insert_start(struct closure *cl) SET_KEY_DIRTY(k, true); for (i = 0; i < KEY_PTRS(k); i++) - SET_GC_MARK(PTR_BUCKET(op->c, k, i), + SET_GC_MARK(PTR_BUCKET(s->c, k, i), GC_MARK_DIRTY); } - SET_KEY_CSUM(k, op->csum); + SET_KEY_CSUM(k, s->csum); if (KEY_CSUM(k)) bio_csum(n, k); @@ -570,10 +570,10 @@ static void bch_data_insert_start(struct closure *cl) bch_keylist_push(&s->insert_keys); n->bi_rw |= REQ_WRITE; - bch_submit_bbio(n, op->c, k, 0); + bch_submit_bbio(n, s->c, k, 0); } while (n != bio); - op->insert_data_done = true; + s->insert_data_done = true; continue_at(cl, bch_data_insert_keys, bcache_wq); err: /* bch_alloc_sectors() blocks if s->writeback = true */ @@ -592,14 +592,14 @@ err: * we wait for buckets to be freed up, so just invalidate the * rest of the write. */ - op->bypass = true; + s->bypass = true; return bch_data_invalidate(cl); } else { /* * From a cache miss, we can just insert the keys for the data * we have written or bail out if we didn't do anything. */ - op->insert_data_done = true; + s->insert_data_done = true; bio_put(bio); if (!bch_keylist_empty(&s->insert_keys)) @@ -622,11 +622,11 @@ err: * data is written it calls bch_journal, and after the keys have been added to * the next journal write they're inserted into the btree. * - * It inserts the data in op->cache_bio; bi_sector is used for the key offset, + * It inserts the data in s->cache_bio; bi_sector is used for the key offset, * and op->inode is used for the key inode. * - * If op->bypass is true, instead of inserting the data it invalidates the - * region of the cache represented by op->cache_bio and op->inode. + * If s->bypass is true, instead of inserting the data it invalidates the + * region of the cache represented by s->cache_bio and op->inode. */ void bch_data_insert(struct closure *cl) { @@ -634,7 +634,7 @@ void bch_data_insert(struct closure *cl) struct search *s = container_of(op, struct search, op); bch_keylist_init(&s->insert_keys); - bio_get(op->cache_bio); + bio_get(s->cache_bio); bch_data_insert_start(cl); } @@ -655,12 +655,12 @@ static void bch_cache_read_endio(struct bio *bio, int error) if (error) s->error = error; - else if (ptr_stale(s->op.c, &b->key, 0)) { - atomic_long_inc(&s->op.c->cache_read_races); + else if (ptr_stale(s->c, &b->key, 0)) { + atomic_long_inc(&s->c->cache_read_races); s->error = -EINTR; } - bch_bbio_endio(s->op.c, bio, error, "reading from cache"); + bch_bbio_endio(s->c, bio, error, "reading from cache"); } /* @@ -674,13 +674,13 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) struct bkey *bio_key; unsigned ptr; - if (bkey_cmp(k, &KEY(op->inode, bio->bi_sector, 0)) <= 0) + if (bkey_cmp(k, &KEY(s->inode, bio->bi_sector, 0)) <= 0) return MAP_CONTINUE; - if (KEY_INODE(k) != s->op.inode || + if (KEY_INODE(k) != s->inode || KEY_START(k) > bio->bi_sector) { unsigned bio_sectors = bio_sectors(bio); - unsigned sectors = KEY_INODE(k) == s->op.inode + unsigned sectors = KEY_INODE(k) == s->inode ? 
min_t(uint64_t, INT_MAX, KEY_START(k) - bio->bi_sector) : INT_MAX; @@ -708,8 +708,8 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) bio_key = &container_of(n, struct bbio, bio)->key; bch_bkey_copy_single_ptr(bio_key, k, ptr); - bch_cut_front(&KEY(s->op.inode, n->bi_sector, 0), bio_key); - bch_cut_back(&KEY(s->op.inode, bio_end_sector(n), 0), bio_key); + bch_cut_front(&KEY(s->inode, n->bi_sector, 0), bio_key); + bch_cut_back(&KEY(s->inode, bio_end_sector(n), 0), bio_key); n->bi_end_io = bch_cache_read_endio; n->bi_private = &s->cl; @@ -735,8 +735,8 @@ static void cache_lookup(struct closure *cl) struct search *s = container_of(op, struct search, op); struct bio *bio = &s->bio.bio; - int ret = bch_btree_map_keys(op, op->c, - &KEY(op->inode, bio->bi_sector, 0), + int ret = bch_btree_map_keys(op, s->c, + &KEY(s->inode, bio->bi_sector, 0), cache_lookup_fn, MAP_END_KEY); if (ret == -EAGAIN) continue_at(cl, cache_lookup, bcache_wq); @@ -793,8 +793,8 @@ static void search_free(struct closure *cl) struct search *s = container_of(cl, struct search, cl); bio_complete(s); - if (s->op.cache_bio) - bio_put(s->op.cache_bio); + if (s->cache_bio) + bio_put(s->cache_bio); if (s->unaligned_bvec) mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); @@ -813,14 +813,14 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) __closure_init(&s->cl, NULL); - s->op.inode = d->id; - s->op.c = d->c; + s->inode = d->id; + s->c = d->c; s->d = d; s->op.lock = -1; s->task = current; s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; + s->flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; s->recoverable = 1; s->start_time = jiffies; do_bio_hook(s); @@ -891,7 +891,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) static bool check_should_bypass(struct cached_dev *dc, struct search *s) { - struct cache_set *c = s->op.c; + struct cache_set *c = s->c; struct bio *bio = &s->bio.bio; unsigned mode = cache_mode(dc, bio); unsigned sectors, congested = bch_get_congested(c); @@ -985,11 +985,11 @@ static void cached_dev_cache_miss_done(struct closure *cl) if (s->op.insert_collision) bch_mark_cache_miss_collision(s); - if (s->op.cache_bio) { + if (s->cache_bio) { int i; struct bio_vec *bv; - __bio_for_each_segment(bv, s->op.cache_bio, i, 0) + bio_for_each_segment_all(bv, s->cache_bio, i) __free_page(bv->bv_page); } @@ -1042,14 +1042,15 @@ static void cached_dev_read_done(struct closure *cl) * to the buffers the original bio pointed to: */ - if (s->op.cache_bio) { - bio_reset(s->op.cache_bio); - s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; - s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; - s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; - bch_bio_map(s->op.cache_bio, NULL); + if (s->cache_bio) { + bio_reset(s->cache_bio); + s->cache_bio->bi_sector = + s->cache_miss->bi_sector; + s->cache_bio->bi_bdev = s->cache_miss->bi_bdev; + s->cache_bio->bi_size = s->cache_bio_sectors << 9; + bch_bio_map(s->cache_bio, NULL); - bio_copy_data(s->cache_miss, s->op.cache_bio); + bio_copy_data(s->cache_miss, s->cache_bio); bio_put(s->cache_miss); s->cache_miss = NULL; @@ -1060,8 +1061,8 @@ static void cached_dev_read_done(struct closure *cl) bio_complete(s); - if (s->op.cache_bio && - !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { + if (s->cache_bio && + !test_bit(CACHE_SET_STOPPING, &s->c->flags)) { s->op.type = BTREE_REPLACE; closure_call(&s->op.cl, 
bch_data_insert, NULL, cl); } @@ -1074,12 +1075,12 @@ static void cached_dev_read_done_bh(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - bch_mark_cache_accounting(s, !s->cache_miss, s->op.bypass); - trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.bypass); + bch_mark_cache_accounting(s, !s->cache_miss, s->bypass); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->bypass); if (s->error) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); - else if (s->op.cache_bio || verify(dc, &s->bio.bio)) + else if (s->cache_bio || verify(dc, &s->bio.bio)) continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); @@ -1093,7 +1094,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; - if (s->cache_miss || s->op.bypass) { + if (s->cache_miss || s->bypass) { miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); ret = miss == bio ? MAP_DONE : MAP_CONTINUE; goto out_submit; @@ -1101,13 +1102,13 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, if (!(bio->bi_rw & REQ_RAHEAD) && !(bio->bi_rw & REQ_META) && - s->op.c->gc_stats.in_use < CUTOFF_CACHE_READA) + s->c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada); - s->op.replace = KEY(s->op.inode, bio->bi_sector + + s->op.replace = KEY(s->inode, bio->bi_sector + s->cache_bio_sectors, s->cache_bio_sectors); ret = bch_btree_insert_check_key(b, &s->op, &s->op.replace); @@ -1137,7 +1138,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, goto out_put; s->cache_miss = miss; - s->op.cache_bio = cache_bio; + s->cache_bio = cache_bio; bio_get(cache_bio); closure_bio_submit(cache_bio, &s->cl, s->d); @@ -1177,7 +1178,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); - bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); + bch_keybuf_check_overlapping(&s->c->moving_gc_keys, &start, &end); down_read_non_owner(&dc->writeback_lock); if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { @@ -1185,7 +1186,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) * We overlap with some dirty data undergoing background * writeback, force this write to writeback */ - s->op.bypass = false; + s->bypass = false; s->writeback = true; } @@ -1197,27 +1198,27 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) * so we still want to call it. 
*/ if (bio->bi_rw & REQ_DISCARD) - s->op.bypass = true; + s->bypass = true; if (should_writeback(dc, s->orig_bio, cache_mode(dc, bio), - s->op.bypass)) { - s->op.bypass = false; + s->bypass)) { + s->bypass = false; s->writeback = true; } - trace_bcache_write(s->orig_bio, s->writeback, s->op.bypass); + trace_bcache_write(s->orig_bio, s->writeback, s->bypass); - if (s->op.bypass) { - s->op.cache_bio = s->orig_bio; - bio_get(s->op.cache_bio); + if (s->bypass) { + s->cache_bio = s->orig_bio; + bio_get(s->cache_bio); if (!(bio->bi_rw & REQ_DISCARD) || blk_queue_discard(bdev_get_queue(dc->bdev))) closure_bio_submit(bio, cl, s->d); } else if (s->writeback) { bch_writeback_add(dc); - s->op.cache_bio = bio; + s->cache_bio = bio; if (bio->bi_rw & REQ_FLUSH) { /* Also need to send a flush to the backing device */ @@ -1232,8 +1233,8 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(flush, cl, s->d); } } else { - s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, - dc->disk.bio_split); + s->cache_bio = bio_clone_bioset(bio, GFP_NOIO, + dc->disk.bio_split); closure_bio_submit(bio, cl, s->d); } @@ -1247,8 +1248,8 @@ static void cached_dev_nodata(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; - if (s->op.flush_journal) - bch_journal_meta(s->op.c, cl); + if (s->flush_journal) + bch_journal_meta(s->c, cl); /* If it's a flush, we send the flush to the backing device too */ closure_bio_submit(bio, cl, s->d); @@ -1286,7 +1287,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) cached_dev_nodata, bcache_wq); } else { - s->op.bypass = check_should_bypass(dc, s); + s->bypass = check_should_bypass(dc, s); if (rw) cached_dev_write(dc, s); @@ -1376,8 +1377,8 @@ static void flash_dev_nodata(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - if (s->op.flush_journal) - bch_journal_meta(s->op.c, cl); + if (s->flush_journal) + bch_journal_meta(s->c, cl); continue_at(cl, search_free, NULL); } @@ -1409,13 +1410,13 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) flash_dev_nodata, bcache_wq); } else if (rw) { - bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, + bch_keybuf_check_overlapping(&s->c->moving_gc_keys, &KEY(d->id, bio->bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); - s->op.bypass = (bio->bi_rw & REQ_DISCARD) != 0; + s->bypass = (bio->bi_rw & REQ_DISCARD) != 0; s->writeback = true; - s->op.cache_bio = bio; + s->cache_bio = bio; closure_call(&s->op.cl, bch_data_insert, NULL, cl); } else { diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index b0b4b0b..0f79177 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -8,19 +8,33 @@ struct search { struct closure cl; struct bcache_device *d; + struct cache_set *c; struct task_struct *task; struct bbio bio; struct bio *orig_bio; struct bio *cache_miss; + + /* Bio to be inserted into the cache */ + struct bio *cache_bio; unsigned cache_bio_sectors; + unsigned inode; + unsigned recoverable:1; unsigned unaligned_bvec:1; unsigned write:1; unsigned writeback:1; + unsigned csum:1; + unsigned bypass:1; + unsigned flush_journal:1; + + unsigned insert_data_done:1; + + uint16_t write_prio; + /* IO error returned to s->bio */ short error; unsigned long start_time; diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index b8730e7..ea77263 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -200,7 +200,7 
@@ void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) { struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); mark_cache_stats(&dc->accounting.collector, hit, bypass); - mark_cache_stats(&s->op.c->accounting.collector, hit, bypass); + mark_cache_stats(&s->c->accounting.collector, hit, bypass); #ifdef CONFIG_CGROUP_BCACHE mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); #endif @@ -210,21 +210,21 @@ void bch_mark_cache_readahead(struct search *s) { struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); atomic_inc(&dc->accounting.collector.cache_readaheads); - atomic_inc(&s->op.c->accounting.collector.cache_readaheads); + atomic_inc(&s->c->accounting.collector.cache_readaheads); } void bch_mark_cache_miss_collision(struct search *s) { struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); atomic_inc(&dc->accounting.collector.cache_miss_collisions); - atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); + atomic_inc(&s->c->accounting.collector.cache_miss_collisions); } void bch_mark_sectors_bypassed(struct search *s, int sectors) { struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); - atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); + atomic_add(sectors, &s->c->accounting.collector.sectors_bypassed); } void bch_cache_accounting_init(struct cache_accounting *acc, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b79dd5a..a314c77 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1493,11 +1493,10 @@ static void run_cache_set(struct cache_set *c) const char *err = "cannot allocate memory"; struct cached_dev *dc, *t; struct cache *ca; + struct closure cl; unsigned i; - struct btree_op op; - bch_btree_op_init_stack(&op); - op.lock = SHRT_MAX; + closure_init_stack(&cl); for_each_cache(ca, c, i) c->nbuckets += ca->sb.nbuckets; @@ -1508,7 +1507,7 @@ static void run_cache_set(struct cache_set *c) struct jset *j; err = "cannot allocate memory for journal"; - if (bch_journal_read(c, &journal, &op)) + if (bch_journal_read(c, &journal)) goto err; pr_debug("btree_journal_read() done"); @@ -1543,12 +1542,12 @@ static void run_cache_set(struct cache_set *c) list_del_init(&c->root->list); rw_unlock(true, c->root); - err = uuid_read(c, j, &op.cl); + err = uuid_read(c, j, &cl); if (err) goto err; err = "error in recovery"; - if (bch_btree_check(c, &op)) + if (bch_btree_check(c)) goto err; bch_journal_mark(c, &journal); @@ -1580,7 +1579,7 @@ static void run_cache_set(struct cache_set *c) if (j->version < BCACHE_JSET_VERSION_UUID) __uuid_write(c); - bch_journal_replay(c, &journal, &op); + bch_journal_replay(c, &journal); } else { pr_notice("invalidating existing data"); @@ -1616,7 +1615,7 @@ static void run_cache_set(struct cache_set *c) goto err; bkey_copy_key(&c->root->key, &MAX_KEY); - bch_btree_node_write(c->root, &op.cl); + bch_btree_node_write(c->root, &cl); bch_btree_set_root(c->root); rw_unlock(true, c->root); @@ -1629,14 +1628,14 @@ static void run_cache_set(struct cache_set *c) SET_CACHE_SYNC(&c->sb, true); bch_journal_next(&c->journal); - bch_journal_meta(c, &op.cl); + bch_journal_meta(c, &cl); } err = "error starting gc thread"; if (bch_gc_thread_start(c)) goto err; - closure_sync(&op.cl); + closure_sync(&cl); c->sb.last_mount = get_seconds(); bcache_write_super(c); @@ -1647,7 +1646,7 @@ static void run_cache_set(struct cache_set *c) return; err: - 
closure_sync(&op.cl); + closure_sync(&cl); /* XXX: test this, it's broken */ bch_cache_set_error(c, err); } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index c68de9f..b58c2bc 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -155,7 +155,7 @@ static void write_dirty_finish(struct closure *cl) for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - bch_btree_insert(&op, dc->disk.c, &keys); + bch_btree_insert(&op, dc->disk.c, &keys, NULL); closure_sync(&op.cl); if (op.insert_collision) @@ -433,9 +433,16 @@ static int bch_writeback_thread(void *arg) /* Init */ -static int sectors_dirty_init_fn(struct btree_op *op, struct btree *b, +struct sectors_dirty_init { + struct btree_op op; + unsigned inode; +}; + +static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, struct bkey *k) { + struct sectors_dirty_init *op = container_of(_op, + struct sectors_dirty_init, op); if (KEY_INODE(k) > op->inode) return MAP_DONE; @@ -448,12 +455,12 @@ static int sectors_dirty_init_fn(struct btree_op *op, struct btree *b, void bch_sectors_dirty_init(struct cached_dev *dc) { - struct btree_op op; + struct sectors_dirty_init op; - bch_btree_op_init_stack(&op); + bch_btree_op_init_stack(&op.op); op.inode = dc->disk.id; - bch_btree_map_keys(&op, dc->disk.c, &KEY(op.inode, 0, 0), + bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); } -- cgit v0.10.2 From b54d6934da7857f87b092df9b77dc1f42818ba94 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 18:04:18 -0700 Subject: bcache: Kill op->cl This isn't used for waiting asynchronously anymore - so this is a fairly trivial refactoring. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index fae5b7b..f7b5525 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1196,7 +1196,7 @@ int bch_bset_print_stats(struct cache_set *c, char *buf) int ret; memset(&t, 0, sizeof(struct bset_stats)); - bch_btree_op_init_stack(&t.op); + bch_btree_op_init(&t.op, -1); ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); if (ret < 0) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 5cb59c3..cb1a490 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -115,13 +115,6 @@ enum { static struct workqueue_struct *btree_io_wq; -void bch_btree_op_init_stack(struct btree_op *op) -{ - memset(op, 0, sizeof(struct btree_op)); - closure_init_stack(&op->cl); - op->lock = -1; -} - static inline bool should_split(struct btree *b) { struct bset *i = write_block(b); @@ -965,8 +958,7 @@ err: * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * - * If IO is necessary, it uses the closure embedded in struct btree_op to wait; - * if that closure is in non blocking mode, will return -EAGAIN. + * If IO is necessary and running under generic_make_request, returns -EAGAIN. * * The btree node will have either a read or a write lock held, depending on * level and op->lock. 
@@ -1260,6 +1252,9 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, { unsigned nodes = 0, keys = 0, blocks; int i; + struct closure cl; + + closure_init_stack(&cl); while (nodes < GC_MERGE_NODES && r[nodes].b) keys += r[nodes++].keys; @@ -1353,9 +1348,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, { void write(struct btree *r) { - if (!r->written) - bch_btree_node_write(r, &op->cl); - else if (btree_node_dirty(r)) + if (!r->written || btree_node_dirty(r)) bch_btree_node_write(r, writes); up_write(&r->lock); @@ -1431,6 +1424,9 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, struct btree *n = NULL; unsigned keys = 0; int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); + struct closure cl; + + closure_init_stack(&cl); if (b->level || stale > 10) n = btree_node_alloc_replacement(b); @@ -1442,11 +1438,11 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, ret = btree_gc_recurse(b, op, writes, gc); if (!b->written || btree_node_dirty(b)) { - bch_btree_node_write(b, n ? &op->cl : NULL); + bch_btree_node_write(b, n ? &cl : NULL); } if (!IS_ERR_OR_NULL(n)) { - closure_sync(&op->cl); + closure_sync(&cl); bch_btree_set_root(b); btree_node_free(n); rw_unlock(true, b); @@ -1545,15 +1541,13 @@ static void bch_btree_gc(struct cache_set *c) memset(&stats, 0, sizeof(struct gc_stat)); closure_init_stack(&writes); - bch_btree_op_init_stack(&op); - op.lock = SHRT_MAX; + bch_btree_op_init(&op, SHRT_MAX); btree_gc_start(c); atomic_inc(&c->prio_blocked); ret = btree_root(gc_root, c, &op, &writes, &stats); - closure_sync(&op.cl); closure_sync(&writes); if (ret) { @@ -1562,8 +1556,8 @@ static void bch_btree_gc(struct cache_set *c) } /* Possibly wait for new UUIDs or whatever to hit disk */ - bch_journal_meta(c, &op.cl); - closure_sync(&op.cl); + bch_journal_meta(c, &writes); + closure_sync(&writes); available = bch_btree_gc_finish(c); @@ -1671,8 +1665,7 @@ int bch_btree_check(struct cache_set *c) struct btree_op op; memset(seen, 0, sizeof(seen)); - bch_btree_op_init_stack(&op); - op.lock = SHRT_MAX; + bch_btree_op_init(&op, SHRT_MAX); for (i = 0; c->cache[i]; i++) { size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); @@ -1980,6 +1973,9 @@ static int btree_split(struct btree *b, struct btree_op *op, bool split; struct btree *n1, *n2 = NULL, *n3 = NULL; uint64_t start_time = local_clock(); + struct closure cl; + + closure_init_stack(&cl); n1 = btree_node_alloc_replacement(b); if (IS_ERR(n1)) @@ -2025,7 +2021,7 @@ static int btree_split(struct btree *b, struct btree_op *op, bkey_copy_key(&n2->key, &b->key); bch_keylist_add(parent_keys, &n2->key); - bch_btree_node_write(n2, &op->cl); + bch_btree_node_write(n2, &cl); rw_unlock(true, n2); } else { trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); @@ -2034,23 +2030,23 @@ static int btree_split(struct btree *b, struct btree_op *op, } bch_keylist_add(parent_keys, &n1->key); - bch_btree_node_write(n1, &op->cl); + bch_btree_node_write(n1, &cl); if (n3) { /* Depth increases, make a new root */ bkey_copy_key(&n3->key, &MAX_KEY); bch_btree_insert_keys(n3, op, parent_keys); - bch_btree_node_write(n3, &op->cl); + bch_btree_node_write(n3, &cl); - closure_sync(&op->cl); + closure_sync(&cl); bch_btree_set_root(n3); rw_unlock(true, n3); } else if (!b->parent) { /* Root filled up but didn't need to be split */ bch_keylist_reset(parent_keys); - closure_sync(&op->cl); + closure_sync(&cl); bch_btree_set_root(n1); } else { unsigned i; @@ -2065,7 +2061,7 @@ static int btree_split(struct 
btree *b, struct btree_op *op, } bch_keylist_push(parent_keys); - closure_sync(&op->cl); + closure_sync(&cl); atomic_inc(&b->c->prio_blocked); } @@ -2126,10 +2122,15 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, BUG_ON(write_block(b) != b->sets[b->nsets].data); if (bch_btree_insert_keys(b, op, insert_keys)) { - if (!b->level) + if (!b->level) { bch_btree_leaf_dirty(b, journal_ref); - else - bch_btree_node_write(b, &op->cl); + } else { + struct closure cl; + + closure_init_stack(&cl); + bch_btree_node_write(b, &cl); + closure_sync(&cl); + } } } } while (!bch_keylist_empty(&split_keys)); @@ -2204,12 +2205,6 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, { int ret = 0; - /* - * Don't want to block with the btree locked unless we have to, - * otherwise we get deadlocks with try_harder and between split/gc - */ - clear_closure_blocking(&op->cl); - BUG_ON(bch_keylist_empty(keys)); while (!bch_keylist_empty(keys)) { @@ -2217,8 +2212,8 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, ret = btree_root(insert_recurse, c, op, keys, journal_ref); if (ret == -EAGAIN) { + BUG(); ret = 0; - closure_sync(&op->cl); } else if (ret) { struct bkey *k; @@ -2292,10 +2287,7 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_nodes_fn *fn, int flags) { - int ret = btree_root(map_nodes_recurse, c, op, from, fn, flags); - if (closure_blocking(&op->cl)) - closure_sync(&op->cl); - return ret; + return btree_root(map_nodes_recurse, c, op, from, fn, flags); } static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, @@ -2328,10 +2320,7 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_keys_fn *fn, int flags) { - int ret = btree_root(map_keys_recurse, c, op, from, fn, flags); - if (closure_blocking(&op->cl)) - closure_sync(&op->cl); - return ret; + return btree_root(map_keys_recurse, c, op, from, fn, flags); } /* Keybuf code */ @@ -2409,7 +2398,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, cond_resched(); - bch_btree_op_init_stack(&refill.op); + bch_btree_op_init(&refill.op, -1); refill.buf = buf; refill.end = end; refill.pred = pred; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 3f820b6..34ee535 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -237,8 +237,6 @@ void __bkey_put(struct cache_set *c, struct bkey *k); /* Recursing down the btree */ struct btree_op { - struct closure cl; - /* Btree level at which we start taking write locks */ short lock; @@ -253,7 +251,11 @@ struct btree_op { BKEY_PADDED(replace); }; -void bch_btree_op_init_stack(struct btree_op *); +static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) +{ + memset(op, 0, sizeof(struct btree_op)); + op->lock = write_lock_level; +} static inline void rw_lock(bool w, struct btree *b, int level) { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 725c8eb..20e900a 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -305,8 +305,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) struct btree_op op; bch_keylist_init(&keylist); - bch_btree_op_init_stack(&op); - op.lock = SHRT_MAX; + bch_btree_op_init(&op, SHRT_MAX); list_for_each_entry(i, list, list) { BUG_ON(i->pin && 
atomic_read(i->pin) != 1); @@ -341,14 +340,13 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) pr_info("journal replay done, %i keys in %i entries, seq %llu", keys, entries, end); - +err: while (!list_empty(list)) { i = list_first_entry(list, struct journal_replay, list); list_del(&i->list); kfree(i); } -err: - closure_sync(&op.cl); + return ret; } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 80e30d7..219356f 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -108,8 +108,8 @@ static void write_moving(struct closure *cl) s->op.type = BTREE_REPLACE; bkey_copy(&s->op.replace, &io->w->key); - closure_init(&s->op.cl, cl); - bch_data_insert(&s->op.cl); + closure_init(&s->btree, cl); + bch_data_insert(&s->btree); } continue_at(cl, write_moving_finish, system_wq); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 818e2e3..5df44fb 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -215,8 +215,7 @@ static void bio_csum(struct bio *bio, struct bkey *k) static void bch_data_insert_keys(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); atomic_t *journal_ref = NULL; /* @@ -236,7 +235,7 @@ static void bch_data_insert_keys(struct closure *cl) s->flush_journal ? &s->cl : NULL); - if (bch_btree_insert(op, s->c, &s->insert_keys, journal_ref)) { + if (bch_btree_insert(&s->op, s->c, &s->insert_keys, journal_ref)) { s->error = -ENOMEM; s->insert_data_done = true; } @@ -433,8 +432,7 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, static void bch_data_invalidate(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); struct bio *bio = s->cache_bio; pr_debug("invalidating %i sectors from %llu", @@ -461,8 +459,7 @@ out: static void bch_data_insert_error(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); /* * Our data write just errored, which means we've got a bunch of keys to @@ -493,8 +490,7 @@ static void bch_data_insert_error(struct closure *cl) static void bch_data_insert_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); if (error) { /* TODO: We could try to recover from this. 
*/ @@ -511,8 +507,7 @@ static void bch_data_insert_endio(struct bio *bio, int error) static void bch_data_insert_start(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); struct bio *bio = s->cache_bio, *n; if (s->bypass) @@ -630,8 +625,7 @@ err: */ void bch_data_insert(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); bch_keylist_init(&s->insert_keys); bio_get(s->cache_bio); @@ -731,11 +725,10 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) static void cache_lookup(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); struct bio *bio = &s->bio.bio; - int ret = bch_btree_map_keys(op, s->c, + int ret = bch_btree_map_keys(&s->op, s->c, &KEY(s->inode, bio->bi_sector, 0), cache_lookup_fn, MAP_END_KEY); if (ret == -EAGAIN) @@ -1064,7 +1057,7 @@ static void cached_dev_read_done(struct closure *cl) if (s->cache_bio && !test_bit(CACHE_SET_STOPPING, &s->c->flags)) { s->op.type = BTREE_REPLACE; - closure_call(&s->op.cl, bch_data_insert, NULL, cl); + closure_call(&s->btree, bch_data_insert, NULL, cl); } continue_at(cl, cached_dev_cache_miss_done, NULL); @@ -1156,7 +1149,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; - closure_call(&s->op.cl, cache_lookup, NULL, cl); + closure_call(&s->btree, cache_lookup, NULL, cl); continue_at(cl, cached_dev_read_done_bh, NULL); } @@ -1239,7 +1232,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(bio, cl, s->d); } - closure_call(&s->op.cl, bch_data_insert, NULL, cl); + closure_call(&s->btree, bch_data_insert, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); } @@ -1418,9 +1411,9 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) s->writeback = true; s->cache_bio = bio; - closure_call(&s->op.cl, bch_data_insert, NULL, cl); + closure_call(&s->btree, bch_data_insert, NULL, cl); } else { - closure_call(&s->op.cl, cache_lookup, NULL, cl); + closure_call(&s->btree, cache_lookup, NULL, cl); } continue_at(cl, search_free, NULL); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 0f79177..ed578aa 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -6,6 +6,7 @@ struct search { /* Stack frame for bio_complete */ struct closure cl; + struct closure btree; struct bcache_device *d; struct cache_set *c; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index b58c2bc..d0968e8 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -143,7 +143,7 @@ static void write_dirty_finish(struct closure *cl) struct btree_op op; struct keylist keys; - bch_btree_op_init_stack(&op); + bch_btree_op_init(&op, -1); bch_keylist_init(&keys); op.type = BTREE_REPLACE; @@ -156,7 +156,6 @@ static void write_dirty_finish(struct closure *cl) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); bch_btree_insert(&op, dc->disk.c, &keys, NULL); - closure_sync(&op.cl); if (op.insert_collision) trace_bcache_writeback_collision(&w->key); @@ -457,7 +456,7 @@ void 
bch_sectors_dirty_init(struct cached_dev *dc) { struct sectors_dirty_init op; - bch_btree_op_init_stack(&op.op); + bch_btree_op_init(&op.op, -1); op.inode = dc->disk.id; bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), -- cgit v0.10.2 From faadf0c96547ec8277ad0abd6959f2ef48522f31 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 Nov 2013 18:03:08 -0700 Subject: bcache: Drop some closure stuff With a the recent bcache refactoring, some of the closure code isn't needed anymore. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index cb1a490..3e0c901 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -407,7 +407,7 @@ static void do_btree_node_write(struct btree *b) b->bio = bch_bbio_alloc(b->c); b->bio->bi_end_io = btree_node_write_endio; - b->bio->bi_private = &b->io.cl; + b->bio->bi_private = cl; b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); bch_bio_map(b->bio, i); @@ -672,8 +672,8 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) } /* wait for any in flight btree write */ - closure_wait_event_sync(&b->io.wait, &cl, - atomic_read(&b->io.cl.remaining) == -1); + closure_wait_event(&b->io.wait, &cl, + atomic_read(&b->io.cl.remaining) == -1); return 0; } diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 9aba201..dfff241 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -11,17 +11,6 @@ #include "closure.h" -void closure_queue(struct closure *cl) -{ - struct workqueue_struct *wq = cl->wq; - if (wq) { - INIT_WORK(&cl->work, cl->work.func); - BUG_ON(!queue_work(wq, &cl->work)); - } else - cl->fn(cl); -} -EXPORT_SYMBOL_GPL(closure_queue); - #define CL_FIELD(type, field) \ case TYPE_ ## type: \ return &container_of(cl, struct type, cl)->field @@ -30,17 +19,6 @@ static struct closure_waitlist *closure_waitlist(struct closure *cl) { switch (cl->type) { CL_FIELD(closure_with_waitlist, wait); - CL_FIELD(closure_with_waitlist_and_timer, wait); - default: - return NULL; - } -} - -static struct timer_list *closure_timer(struct closure *cl) -{ - switch (cl->type) { - CL_FIELD(closure_with_timer, timer); - CL_FIELD(closure_with_waitlist_and_timer, timer); default: return NULL; } @@ -51,7 +29,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) int r = flags & CLOSURE_REMAINING_MASK; BUG_ON(flags & CLOSURE_GUARD_MASK); - BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING))); + BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); /* Must deliver precisely one wakeup */ if (r == 1 && (flags & CLOSURE_SLEEPING)) @@ -59,7 +37,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) if (!r) { if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { - /* CLOSURE_BLOCKING might be set - clear it */ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); closure_queue(cl); @@ -90,13 +67,13 @@ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); } -EXPORT_SYMBOL_GPL(closure_sub); +EXPORT_SYMBOL(closure_sub); void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } -EXPORT_SYMBOL_GPL(closure_put); +EXPORT_SYMBOL(closure_put); static void set_waiting(struct closure *cl, unsigned long f) { @@ -133,7 +110,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) closure_sub(cl, CLOSURE_WAITING + 1); } } -EXPORT_SYMBOL_GPL(__closure_wake_up); 
+EXPORT_SYMBOL(__closure_wake_up); bool closure_wait(struct closure_waitlist *list, struct closure *cl) { @@ -146,7 +123,7 @@ bool closure_wait(struct closure_waitlist *list, struct closure *cl) return true; } -EXPORT_SYMBOL_GPL(closure_wait); +EXPORT_SYMBOL(closure_wait); /** * closure_sync() - sleep until a closure a closure has nothing left to wait on @@ -169,7 +146,7 @@ void closure_sync(struct closure *cl) __closure_end_sleep(cl); } -EXPORT_SYMBOL_GPL(closure_sync); +EXPORT_SYMBOL(closure_sync); /** * closure_trylock() - try to acquire the closure, without waiting @@ -183,17 +160,17 @@ bool closure_trylock(struct closure *cl, struct closure *parent) CLOSURE_REMAINING_INITIALIZER) != -1) return false; - closure_set_ret_ip(cl); - smp_mb(); + cl->parent = parent; if (parent) closure_get(parent); + closure_set_ret_ip(cl); closure_debug_create(cl); return true; } -EXPORT_SYMBOL_GPL(closure_trylock); +EXPORT_SYMBOL(closure_trylock); void __closure_lock(struct closure *cl, struct closure *parent, struct closure_waitlist *wait_list) @@ -205,57 +182,11 @@ void __closure_lock(struct closure *cl, struct closure *parent, if (closure_trylock(cl, parent)) return; - closure_wait_event_sync(wait_list, &wait, - atomic_read(&cl->remaining) == -1); + closure_wait_event(wait_list, &wait, + atomic_read(&cl->remaining) == -1); } } -EXPORT_SYMBOL_GPL(__closure_lock); - -static void closure_delay_timer_fn(unsigned long data) -{ - struct closure *cl = (struct closure *) data; - closure_sub(cl, CLOSURE_TIMER + 1); -} - -void do_closure_timer_init(struct closure *cl) -{ - struct timer_list *timer = closure_timer(cl); - - init_timer(timer); - timer->data = (unsigned long) cl; - timer->function = closure_delay_timer_fn; -} -EXPORT_SYMBOL_GPL(do_closure_timer_init); - -bool __closure_delay(struct closure *cl, unsigned long delay, - struct timer_list *timer) -{ - if (atomic_read(&cl->remaining) & CLOSURE_TIMER) - return false; - - BUG_ON(timer_pending(timer)); - - timer->expires = jiffies + delay; - - atomic_add(CLOSURE_TIMER + 1, &cl->remaining); - add_timer(timer); - return true; -} -EXPORT_SYMBOL_GPL(__closure_delay); - -void __closure_flush(struct closure *cl, struct timer_list *timer) -{ - if (del_timer(timer)) - closure_sub(cl, CLOSURE_TIMER + 1); -} -EXPORT_SYMBOL_GPL(__closure_flush); - -void __closure_flush_sync(struct closure *cl, struct timer_list *timer) -{ - if (del_timer_sync(timer)) - closure_sub(cl, CLOSURE_TIMER + 1); -} -EXPORT_SYMBOL_GPL(__closure_flush_sync); +EXPORT_SYMBOL(__closure_lock); #ifdef CONFIG_BCACHE_CLOSURES_DEBUG @@ -273,7 +204,7 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL_GPL(closure_debug_create); +EXPORT_SYMBOL(closure_debug_create); void closure_debug_destroy(struct closure *cl) { @@ -286,7 +217,7 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL_GPL(closure_debug_destroy); +EXPORT_SYMBOL(closure_debug_destroy); static struct dentry *debug; @@ -304,14 +235,12 @@ static int debug_seq_show(struct seq_file *f, void *data) cl, (void *) cl->ip, cl->fn, cl->parent, r & CLOSURE_REMAINING_MASK); - seq_printf(f, "%s%s%s%s%s%s\n", + seq_printf(f, "%s%s%s%s\n", test_bit(WORK_STRUCT_PENDING, work_data_bits(&cl->work)) ? "Q" : "", r & CLOSURE_RUNNING ? "R" : "", - r & CLOSURE_BLOCKING ? "B" : "", r & CLOSURE_STACK ? "S" : "", - r & CLOSURE_SLEEPING ? "Sl" : "", - r & CLOSURE_TIMER ? 
"T" : ""); + r & CLOSURE_SLEEPING ? "Sl" : ""); if (r & CLOSURE_WAITING) seq_printf(f, " W %pF\n", diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index ab011f0..9762f1b 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -155,21 +155,6 @@ * delayed_work embeds a work item and a timer_list. The important thing is, use * it exactly like you would a regular closure and closure_put() will magically * handle everything for you. - * - * We've got closures that embed timers, too. They're called, appropriately - * enough: - * struct closure_with_timer; - * - * This gives you access to closure_delay(). It takes a refcount for a specified - * number of jiffies - you could then call closure_sync() (for a slightly - * convoluted version of msleep()) or continue_at() - which gives you the same - * effect as using a delayed work item, except you can reuse the work_struct - * already embedded in struct closure. - * - * Lastly, there's struct closure_with_waitlist_and_timer. It does what you - * probably expect, if you happen to need the features of both. (You don't - * really want to know how all this is implemented, but if I've done my job - * right you shouldn't have to care). */ struct closure; @@ -182,16 +167,11 @@ struct closure_waitlist { enum closure_type { TYPE_closure = 0, TYPE_closure_with_waitlist = 1, - TYPE_closure_with_timer = 2, - TYPE_closure_with_waitlist_and_timer = 3, - MAX_CLOSURE_TYPE = 3, + MAX_CLOSURE_TYPE = 1, }; enum closure_state { /* - * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of - * waiting asynchronously - * * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by * the thread that owns the closure, and cleared by the thread that's * waking up the closure. @@ -200,10 +180,6 @@ enum closure_state { * - indicates that cl->task is valid and closure_put() may wake it up. * Only set or cleared by the thread that owns the closure. * - * CLOSURE_TIMER: Analagous to CLOSURE_WAITING, indicates that a closure - * has an outstanding timer. Must be set by the thread that owns the - * closure, and cleared by the timer function when the timer goes off. - * * The rest are for debugging and don't affect behaviour: * * CLOSURE_RUNNING: Set when a closure is running (i.e. 
by @@ -218,19 +194,17 @@ enum closure_state { * closure with this flag set */ - CLOSURE_BITS_START = (1 << 19), - CLOSURE_DESTRUCTOR = (1 << 19), - CLOSURE_BLOCKING = (1 << 21), - CLOSURE_WAITING = (1 << 23), - CLOSURE_SLEEPING = (1 << 25), - CLOSURE_TIMER = (1 << 27), + CLOSURE_BITS_START = (1 << 23), + CLOSURE_DESTRUCTOR = (1 << 23), + CLOSURE_WAITING = (1 << 25), + CLOSURE_SLEEPING = (1 << 27), CLOSURE_RUNNING = (1 << 29), CLOSURE_STACK = (1 << 31), }; #define CLOSURE_GUARD_MASK \ - ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \ - CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1) + ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \ + CLOSURE_RUNNING|CLOSURE_STACK) << 1) #define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) #define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) @@ -268,17 +242,6 @@ struct closure_with_waitlist { struct closure_waitlist wait; }; -struct closure_with_timer { - struct closure cl; - struct timer_list timer; -}; - -struct closure_with_waitlist_and_timer { - struct closure cl; - struct closure_waitlist wait; - struct timer_list timer; -}; - extern unsigned invalid_closure_type(void); #define __CLOSURE_TYPE(cl, _t) \ @@ -289,14 +252,11 @@ extern unsigned invalid_closure_type(void); ( \ __CLOSURE_TYPE(cl, closure) \ __CLOSURE_TYPE(cl, closure_with_waitlist) \ - __CLOSURE_TYPE(cl, closure_with_timer) \ - __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \ invalid_closure_type() \ ) void closure_sub(struct closure *cl, int v); void closure_put(struct closure *cl); -void closure_queue(struct closure *cl); void __closure_wake_up(struct closure_waitlist *list); bool closure_wait(struct closure_waitlist *list, struct closure *cl); void closure_sync(struct closure *cl); @@ -305,12 +265,6 @@ bool closure_trylock(struct closure *cl, struct closure *parent); void __closure_lock(struct closure *cl, struct closure *parent, struct closure_waitlist *wait_list); -void do_closure_timer_init(struct closure *cl); -bool __closure_delay(struct closure *cl, unsigned long delay, - struct timer_list *timer); -void __closure_flush(struct closure *cl, struct timer_list *timer); -void __closure_flush_sync(struct closure *cl, struct timer_list *timer); - #ifdef CONFIG_BCACHE_CLOSURES_DEBUG void closure_debug_init(void); @@ -354,11 +308,6 @@ static inline void closure_set_stopped(struct closure *cl) atomic_sub(CLOSURE_RUNNING, &cl->remaining); } -static inline bool closure_is_stopped(struct closure *cl) -{ - return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING); -} - static inline bool closure_is_unlocked(struct closure *cl) { return atomic_read(&cl->remaining) == -1; @@ -367,14 +316,6 @@ static inline bool closure_is_unlocked(struct closure *cl) static inline void do_closure_init(struct closure *cl, struct closure *parent, bool running) { - switch (cl->type) { - case TYPE_closure_with_timer: - case TYPE_closure_with_waitlist_and_timer: - do_closure_timer_init(cl); - default: - break; - } - cl->parent = parent; if (parent) closure_get(parent); @@ -429,8 +370,7 @@ do { \ static inline void closure_init_stack(struct closure *cl) { memset(cl, 0, sizeof(struct closure)); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| - CLOSURE_BLOCKING|CLOSURE_STACK); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); } /** @@ -461,24 +401,6 @@ do { \ #define closure_lock(cl, parent) \ __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) -/** - * closure_delay() - delay some number of jiffies - * @cl: the closure that 
will sleep - * @delay: the delay in jiffies - * - * Takes a refcount on @cl which will be released after @delay jiffies; this may - * be used to have a function run after a delay with continue_at(), or - * closure_sync() may be used for a convoluted version of msleep(). - */ -#define closure_delay(cl, delay) \ - __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer) - -#define closure_flush(cl) \ - __closure_flush(__to_internal_closure(cl), &(cl)->timer) - -#define closure_flush_sync(cl) \ - __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer) - static inline void __closure_end_sleep(struct closure *cl) { __set_current_state(TASK_RUNNING); @@ -498,40 +420,6 @@ static inline void __closure_start_sleep(struct closure *cl) } /** - * closure_blocking() - returns true if the closure is in blocking mode. - * - * If a closure is in blocking mode, closure_wait_event() will sleep until the - * condition is true instead of waiting asynchronously. - */ -static inline bool closure_blocking(struct closure *cl) -{ - return atomic_read(&cl->remaining) & CLOSURE_BLOCKING; -} - -/** - * set_closure_blocking() - put a closure in blocking mode. - * - * If a closure is in blocking mode, closure_wait_event() will sleep until the - * condition is true instead of waiting asynchronously. - * - * Not thread safe - can only be called by the thread running the closure. - */ -static inline void set_closure_blocking(struct closure *cl) -{ - if (!closure_blocking(cl)) - atomic_add(CLOSURE_BLOCKING, &cl->remaining); -} - -/* - * Not thread safe - can only be called by the thread running the closure. - */ -static inline void clear_closure_blocking(struct closure *cl) -{ - if (closure_blocking(cl)) - atomic_sub(CLOSURE_BLOCKING, &cl->remaining); -} - -/** * closure_wake_up() - wake up all closures on a wait list. */ static inline void closure_wake_up(struct closure_waitlist *list) @@ -561,63 +449,36 @@ static inline void closure_wake_up(struct closure_waitlist *list) * refcount on our closure. If this was a stack allocated closure, that would be * bad. */ -#define __closure_wait_event(list, cl, condition, _block) \ +#define closure_wait_event(list, cl, condition) \ ({ \ - bool block = _block; \ typeof(condition) ret; \ \ while (1) { \ ret = (condition); \ if (ret) { \ __closure_wake_up(list); \ - if (block) \ - closure_sync(cl); \ - \ + closure_sync(cl); \ break; \ } \ \ - if (block) \ - __closure_start_sleep(cl); \ - \ - if (!closure_wait(list, cl)) { \ - if (!block) \ - break; \ + __closure_start_sleep(cl); \ \ + if (!closure_wait(list, cl)) \ schedule(); \ - } \ } \ \ ret; \ }) -/** - * closure_wait_event() - wait on a condition, synchronously or asynchronously. - * @list: the wait list to wait on - * @cl: the closure that is doing the waiting - * @condition: a C expression for the event to wait for - * - * If the closure is in blocking mode, sleeps until the @condition evaluates to - * true - exactly like wait_event(). - * - * If the closure is not in blocking mode, waits asynchronously; if the - * condition is currently false the @cl is put onto @list and returns. @list - * owns a refcount on @cl; closure_sync() or continue_at() may be used later to - * wait for another thread to wake up @list, which drops the refcount on @cl. - * - * Returns the value of @condition; @cl will be on @list iff @condition was - * false. - * - * closure_wake_up(@list) must be called after changing any variable that could - * cause @condition to become true. 
- */ -#define closure_wait_event(list, cl, condition) \ - __closure_wait_event(list, cl, condition, closure_blocking(cl)) - -#define closure_wait_event_async(list, cl, condition) \ - __closure_wait_event(list, cl, condition, false) - -#define closure_wait_event_sync(list, cl, condition) \ - __closure_wait_event(list, cl, condition, true) +static inline void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); + } else + cl->fn(cl); +} static inline void set_closure_fn(struct closure *cl, closure_fn *fn, struct workqueue_struct *wq) -- cgit v0.10.2 From 1b207d80d5b986fb305bc899357435d319319513 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 18:52:54 -0700 Subject: bcache: Kill op->replace This is prep work for converting bch_btree_insert to bch_btree_map_leaf_nodes() - we have to convert all its arguments to actual arguments. Bunch of churn, but should be straightforward. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 3e0c901..7a1d8dc 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -89,15 +89,6 @@ * Test module load/unload */ -static const char * const op_types[] = { - "insert", "replace" -}; - -static const char *op_type(struct btree_op *op) -{ - return op_types[op->type]; -} - enum { BTREE_INSERT_STATUS_INSERT, BTREE_INSERT_STATUS_BACK_MERGE, @@ -1699,10 +1690,9 @@ static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) bch_bset_fix_lookup_table(b, where); } -static bool fix_overlapping_extents(struct btree *b, - struct bkey *insert, +static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, struct btree_iter *iter, - struct btree_op *op) + struct bkey *replace_key) { void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) { @@ -1730,39 +1720,38 @@ static bool fix_overlapping_extents(struct btree *b, * We might overlap with 0 size extents; we can't skip these * because if they're in the set we're inserting to we have to * adjust them so they don't overlap with the key we're - * inserting. But we don't want to check them for BTREE_REPLACE + * inserting. But we don't want to check them for replace * operations. 
*/ - if (op->type == BTREE_REPLACE && - KEY_SIZE(k)) { + if (replace_key && KEY_SIZE(k)) { /* * k might have been split since we inserted/found the * key we're replacing */ unsigned i; uint64_t offset = KEY_START(k) - - KEY_START(&op->replace); + KEY_START(replace_key); /* But it must be a subset of the replace key */ - if (KEY_START(k) < KEY_START(&op->replace) || - KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) + if (KEY_START(k) < KEY_START(replace_key) || + KEY_OFFSET(k) > KEY_OFFSET(replace_key)) goto check_failed; /* We didn't find a key that we were supposed to */ if (KEY_START(k) > KEY_START(insert) + sectors_found) goto check_failed; - if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) + if (KEY_PTRS(replace_key) != KEY_PTRS(k)) goto check_failed; /* skip past gen */ offset <<= 8; - BUG_ON(!KEY_PTRS(&op->replace)); + BUG_ON(!KEY_PTRS(replace_key)); - for (i = 0; i < KEY_PTRS(&op->replace); i++) - if (k->ptr[i] != op->replace.ptr[i] + offset) + for (i = 0; i < KEY_PTRS(replace_key); i++) + if (k->ptr[i] != replace_key->ptr[i] + offset) goto check_failed; sectors_found = KEY_OFFSET(k) - KEY_START(insert); @@ -1833,9 +1822,8 @@ static bool fix_overlapping_extents(struct btree *b, } check_failed: - if (op->type == BTREE_REPLACE) { + if (replace_key) { if (!sectors_found) { - op->insert_collision = true; return true; } else if (sectors_found < KEY_SIZE(insert)) { SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - @@ -1848,7 +1836,7 @@ check_failed: } static bool btree_insert_key(struct btree *b, struct btree_op *op, - struct bkey *k) + struct bkey *k, struct bkey *replace_key) { struct bset *i = b->sets[b->nsets].data; struct bkey *m, *prev; @@ -1874,8 +1862,10 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, prev = NULL; m = bch_btree_iter_init(b, &iter, &search); - if (fix_overlapping_extents(b, k, &iter, op)) + if (fix_overlapping_extents(b, k, &iter, replace_key)) { + op->insert_collision = true; return false; + } if (KEY_DIRTY(k)) bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), @@ -1903,24 +1893,28 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, if (m != end(i) && bch_bkey_try_merge(b, k, m)) goto copy; - } else + } else { + BUG_ON(replace_key); m = bch_bset_search(b, &b->sets[b->nsets], k); + } insert: shift_keys(b, m, k); copy: bkey_copy(m, k); merged: - bch_check_keys(b, "%u for %s", status, op_type(op)); + bch_check_keys(b, "%u for %s", status, + replace_key ? 
"replace" : "insert"); if (b->level && !KEY_OFFSET(k)) btree_current_write(b)->prio_blocked++; - trace_bcache_btree_insert_key(b, k, op->type, status); + trace_bcache_btree_insert_key(b, k, replace_key != NULL, status); return true; } static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, - struct keylist *insert_keys) + struct keylist *insert_keys, + struct bkey *replace_key) { bool ret = false; unsigned oldsize = bch_count_data(b); @@ -1936,11 +1930,11 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, if (bkey_cmp(k, &b->key) <= 0) { bkey_put(b->c, k, b->level); - ret |= btree_insert_key(b, op, k); + ret |= btree_insert_key(b, op, k, replace_key); bch_keylist_pop_front(insert_keys); } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { #if 0 - if (op->type == BTREE_REPLACE) { + if (replace_key) { bkey_put(b->c, k, b->level); bch_keylist_pop_front(insert_keys); op->insert_collision = true; @@ -1953,7 +1947,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, bch_cut_back(&b->key, &temp.key); bch_cut_front(&b->key, insert_keys->keys); - ret |= btree_insert_key(b, op, &temp.key); + ret |= btree_insert_key(b, op, &temp.key, replace_key); break; } else { break; @@ -1968,7 +1962,8 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, static int btree_split(struct btree *b, struct btree_op *op, struct keylist *insert_keys, - struct keylist *parent_keys) + struct keylist *parent_keys, + struct bkey *replace_key) { bool split; struct btree *n1, *n2 = NULL, *n3 = NULL; @@ -1998,7 +1993,7 @@ static int btree_split(struct btree *b, struct btree_op *op, goto err_free2; } - bch_btree_insert_keys(n1, op, insert_keys); + bch_btree_insert_keys(n1, op, insert_keys, replace_key); /* * Has to be a linear search because we don't have an auxiliary @@ -2026,7 +2021,7 @@ static int btree_split(struct btree *b, struct btree_op *op, } else { trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); - bch_btree_insert_keys(n1, op, insert_keys); + bch_btree_insert_keys(n1, op, insert_keys, replace_key); } bch_keylist_add(parent_keys, &n1->key); @@ -2036,7 +2031,7 @@ static int btree_split(struct btree *b, struct btree_op *op, /* Depth increases, make a new root */ bkey_copy_key(&n3->key, &MAX_KEY); - bch_btree_insert_keys(n3, op, parent_keys); + bch_btree_insert_keys(n3, op, parent_keys, NULL); bch_btree_node_write(n3, &cl); closure_sync(&cl); @@ -2091,7 +2086,8 @@ err: static int bch_btree_insert_node(struct btree *b, struct btree_op *op, struct keylist *insert_keys, - atomic_t *journal_ref) + atomic_t *journal_ref, + struct bkey *replace_key) { int ret = 0; struct keylist split_keys; @@ -2101,6 +2097,8 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, BUG_ON(b->level); do { + BUG_ON(b->level && replace_key); + if (should_split(b)) { if (current->bio_list) { op->lock = b->c->root->level + 1; @@ -2112,8 +2110,9 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, struct btree *parent = b->parent; ret = btree_split(b, op, insert_keys, - &split_keys); + &split_keys, replace_key); insert_keys = &split_keys; + replace_key = NULL; b = parent; if (!ret) ret = -EINTR; @@ -2121,7 +2120,8 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, } else { BUG_ON(write_block(b) != b->sets[b->nsets].data); - if (bch_btree_insert_keys(b, op, insert_keys)) { + if (bch_btree_insert_keys(b, op, insert_keys, + replace_key)) { if (!b->level) { bch_btree_leaf_dirty(b, journal_ref); } else 
{ @@ -2165,9 +2165,7 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, bch_keylist_add(&insert, check_key); - BUG_ON(op->type != BTREE_INSERT); - - ret = bch_btree_insert_node(b, op, &insert, NULL); + ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); BUG_ON(!ret && !bch_keylist_empty(&insert)); out: @@ -2177,7 +2175,8 @@ out: } static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, - struct keylist *keys, atomic_t *journal_ref) + struct keylist *keys, atomic_t *journal_ref, + struct bkey *replace_key) { if (bch_keylist_empty(keys)) return 0; @@ -2194,14 +2193,17 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, return -EIO; } - return btree(insert_recurse, k, b, op, keys, journal_ref); + return btree(insert_recurse, k, b, op, keys, + journal_ref, replace_key); } else { - return bch_btree_insert_node(b, op, keys, journal_ref); + return bch_btree_insert_node(b, op, keys, + journal_ref, replace_key); } } int bch_btree_insert(struct btree_op *op, struct cache_set *c, - struct keylist *keys, atomic_t *journal_ref) + struct keylist *keys, atomic_t *journal_ref, + struct bkey *replace_key) { int ret = 0; @@ -2209,7 +2211,8 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, while (!bch_keylist_empty(keys)) { op->lock = 0; - ret = btree_root(insert_recurse, c, op, keys, journal_ref); + ret = btree_root(insert_recurse, c, op, keys, + journal_ref, replace_key); if (ret == -EAGAIN) { BUG(); @@ -2217,8 +2220,7 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, } else if (ret) { struct bkey *k; - pr_err("error %i trying to insert key for %s", - ret, op_type(op)); + pr_err("error %i", ret); while ((k = bch_keylist_pop(keys))) bkey_put(c, k, 0); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 34ee535..6ff08be 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -240,15 +240,7 @@ struct btree_op { /* Btree level at which we start taking write locks */ short lock; - /* Btree insertion type */ - enum { - BTREE_INSERT, - BTREE_REPLACE - } type:8; - unsigned insert_collision:1; - - BKEY_PADDED(replace); }; static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) @@ -290,7 +282,7 @@ struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); int bch_btree_insert(struct btree_op *, struct cache_set *, - struct keylist *, atomic_t *); + struct keylist *, atomic_t *, struct bkey *); int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 20e900a..592adf5 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -322,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) bkey_copy(keylist.top, k); bch_keylist_push(&keylist); - ret = bch_btree_insert(&op, s, &keylist, i->pin); + ret = bch_btree_insert(&op, s, &keylist, i->pin, NULL); if (ret) goto err; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 219356f..c45ba4f 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -105,8 +105,8 @@ static void write_moving(struct closure *cl) s->writeback = KEY_DIRTY(&io->w->key); s->csum = KEY_CSUM(&io->w->key); - s->op.type = BTREE_REPLACE; - bkey_copy(&s->op.replace, &io->w->key); + bkey_copy(&s->replace_key, &io->w->key); + s->replace = true; 
closure_init(&s->btree, cl); bch_data_insert(&s->btree); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 5df44fb..16a3e16 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -217,6 +217,7 @@ static void bch_data_insert_keys(struct closure *cl) { struct search *s = container_of(cl, struct search, btree); atomic_t *journal_ref = NULL; + struct bkey *replace_key = s->replace ? &s->replace_key : NULL; /* * If we're looping, might already be waiting on @@ -235,7 +236,8 @@ static void bch_data_insert_keys(struct closure *cl) s->flush_journal ? &s->cl : NULL); - if (bch_btree_insert(&s->op, s->c, &s->insert_keys, journal_ref)) { + if (bch_btree_insert(&s->op, s->c, &s->insert_keys, + journal_ref, replace_key)) { s->error = -ENOMEM; s->insert_data_done = true; } @@ -1056,7 +1058,7 @@ static void cached_dev_read_done(struct closure *cl) if (s->cache_bio && !test_bit(CACHE_SET_STOPPING, &s->c->flags)) { - s->op.type = BTREE_REPLACE; + BUG_ON(!s->replace); closure_call(&s->btree, bch_data_insert, NULL, cl); } @@ -1101,13 +1103,15 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada); - s->op.replace = KEY(s->inode, bio->bi_sector + - s->cache_bio_sectors, s->cache_bio_sectors); + s->replace_key = KEY(s->inode, bio->bi_sector + + s->cache_bio_sectors, s->cache_bio_sectors); - ret = bch_btree_insert_check_key(b, &s->op, &s->op.replace); + ret = bch_btree_insert_check_key(b, &s->op, &s->replace_key); if (ret) return ret; + s->replace = true; + miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); /* btree_search_recurse()'s btree iterator is no good anymore */ diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index ed578aa..5ea7a0e 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -33,6 +33,7 @@ struct search { unsigned flush_journal:1; unsigned insert_data_done:1; + unsigned replace:1; uint16_t write_prio; @@ -44,6 +45,7 @@ struct search { /* Anything past this point won't get zeroed in search_alloc() */ struct keylist insert_keys; + BKEY_PADDED(replace_key); }; unsigned bch_get_congested(struct cache_set *); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d0968e8..346a534 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -146,16 +146,14 @@ static void write_dirty_finish(struct closure *cl) bch_btree_op_init(&op, -1); bch_keylist_init(&keys); - op.type = BTREE_REPLACE; - bkey_copy(&op.replace, &w->key); - - SET_KEY_DIRTY(&w->key, false); - bch_keylist_add(&keys, &w->key); + bkey_copy(keys.top, &w->key); + SET_KEY_DIRTY(keys.top, false); + bch_keylist_push(&keys); for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - bch_btree_insert(&op, dc->disk.c, &keys, NULL); + bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key); if (op.insert_collision) trace_bcache_writeback_collision(&w->key); -- cgit v0.10.2 From 6054c6d4da1940c7bf8870c6393773aa794f53d8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 18:06:22 -0700 Subject: bcache: Don't use op->insert_collision When we convert bch_btree_insert() to bch_btree_map_leaf_nodes(), we won't be passing struct btree_op to bch_btree_insert() anymore - so we need a different way of returning whether there was a collision (really, a replace collision). 
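In rough outline, the caller-side convention after this change looks like the
sketch below (names are taken from the request.c and writeback.c hunks that
follow; this is only an illustration of the error-code convention, not a
complete caller):

        ret = bch_btree_insert(&s->op, s->c, &s->insert_keys,
                               journal_ref, replace_key);
        if (ret == -ESRCH)      /* replace key not found: collision */
                s->insert_collision = true;
        else if (ret)           /* a real insert failure */
                s->error = -ENOMEM;

-ESRCH is overloaded here purely as "the key we were asked to replace wasn't
there", which keeps the collision signal out of struct btree_op.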
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7a1d8dc..441524d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2227,6 +2227,9 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, } } + if (op->insert_collision) + return -ESRCH; + return ret; } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index c45ba4f..601c96a 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -52,7 +52,7 @@ static void write_moving_finish(struct closure *cl) bio_for_each_segment_all(bv, bio, i) __free_page(bv->bv_page); - if (io->s.op.insert_collision) + if (io->s.insert_collision) trace_bcache_gc_copy_collision(&io->w->key); bch_keybuf_del(&io->s.c->moving_gc_keys, io->w); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 16a3e16..bcce06a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -218,6 +218,7 @@ static void bch_data_insert_keys(struct closure *cl) struct search *s = container_of(cl, struct search, btree); atomic_t *journal_ref = NULL; struct bkey *replace_key = s->replace ? &s->replace_key : NULL; + int ret; /* * If we're looping, might already be waiting on @@ -236,8 +237,11 @@ static void bch_data_insert_keys(struct closure *cl) s->flush_journal ? &s->cl : NULL); - if (bch_btree_insert(&s->op, s->c, &s->insert_keys, - journal_ref, replace_key)) { + ret = bch_btree_insert(&s->op, s->c, &s->insert_keys, + journal_ref, replace_key); + if (ret == -ESRCH) { + s->insert_collision = true; + } else if (ret) { s->error = -ENOMEM; s->insert_data_done = true; } @@ -977,7 +981,7 @@ static void cached_dev_cache_miss_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - if (s->op.insert_collision) + if (s->insert_collision) bch_mark_cache_miss_collision(s); if (s->cache_bio) { diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 5ea7a0e..f0e930b 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -34,6 +34,7 @@ struct search { unsigned insert_data_done:1; unsigned replace:1; + unsigned insert_collision:1; uint16_t write_prio; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 346a534..312032e 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -142,6 +142,7 @@ static void write_dirty_finish(struct closure *cl) unsigned i; struct btree_op op; struct keylist keys; + int ret; bch_btree_op_init(&op, -1); bch_keylist_init(&keys); @@ -153,12 +154,12 @@ static void write_dirty_finish(struct closure *cl) for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key); + ret = bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key); - if (op.insert_collision) + if (ret) trace_bcache_writeback_collision(&w->key); - atomic_long_inc(op.insert_collision + atomic_long_inc(ret ? &dc->disk.c->writeback_keys_failed : &dc->disk.c->writeback_keys_done); } -- cgit v0.10.2 From cc7b8819212f437fc82f0f9cdc24deb0fb5d775f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 18:07:22 -0700 Subject: bcache: Convert bch_btree_insert() to bch_btree_map_leaf_nodes() Last of the btree_map() conversions. Main visible effect is bch_btree_insert() is no longer taking a struct btree_op as an argument anymore - there's no fancy state machine stuff going on, it's just a normal function. 
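Roughly, a call goes from

        bch_btree_insert(&op, c, &keys, journal_ref, replace_key);

to

        bch_btree_insert(c, &keys, journal_ref, replace_key);

with the btree_op now private to btree.c: bch_btree_insert() wraps its
arguments in a small struct btree_insert_op and drives the insert through
bch_btree_map_leaf_nodes() and btree_insert_fn(), as in the btree.c hunk
below. (Sketch of the new convention only; see the journal.c, request.c and
writeback.c hunks for the actual caller conversions.)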
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 441524d..f5aa4ad 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2174,61 +2174,56 @@ out: return ret; } -static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, - struct keylist *keys, atomic_t *journal_ref, - struct bkey *replace_key) -{ - if (bch_keylist_empty(keys)) - return 0; - - if (b->level) { - struct bkey *k; - - k = bch_next_recurse_key(b, &START_KEY(keys->keys)); - if (!k) { - btree_bug(b, "no key to recurse on at level %i/%i", - b->level, b->c->root->level); +struct btree_insert_op { + struct btree_op op; + struct keylist *keys; + atomic_t *journal_ref; + struct bkey *replace_key; +}; - bch_keylist_reset(keys); - return -EIO; - } +int btree_insert_fn(struct btree_op *b_op, struct btree *b) +{ + struct btree_insert_op *op = container_of(b_op, + struct btree_insert_op, op); - return btree(insert_recurse, k, b, op, keys, - journal_ref, replace_key); - } else { - return bch_btree_insert_node(b, op, keys, - journal_ref, replace_key); - } + int ret = bch_btree_insert_node(b, &op->op, op->keys, + op->journal_ref, op->replace_key); + if (ret && !bch_keylist_empty(op->keys)) + return ret; + else + return MAP_DONE; } -int bch_btree_insert(struct btree_op *op, struct cache_set *c, - struct keylist *keys, atomic_t *journal_ref, - struct bkey *replace_key) +int bch_btree_insert(struct cache_set *c, struct keylist *keys, + atomic_t *journal_ref, struct bkey *replace_key) { + struct btree_insert_op op; int ret = 0; + BUG_ON(current->bio_list); BUG_ON(bch_keylist_empty(keys)); - while (!bch_keylist_empty(keys)) { - op->lock = 0; - ret = btree_root(insert_recurse, c, op, keys, - journal_ref, replace_key); + bch_btree_op_init(&op.op, 0); + op.keys = keys; + op.journal_ref = journal_ref; + op.replace_key = replace_key; - if (ret == -EAGAIN) { - BUG(); - ret = 0; - } else if (ret) { - struct bkey *k; + while (!ret && !bch_keylist_empty(keys)) { + op.op.lock = 0; + ret = bch_btree_map_leaf_nodes(&op.op, c, + &START_KEY(keys->keys), + btree_insert_fn); + } - pr_err("error %i", ret); + if (ret) { + struct bkey *k; - while ((k = bch_keylist_pop(keys))) - bkey_put(c, k, 0); - } - } + pr_err("error %i", ret); - if (op->insert_collision) - return -ESRCH; + while ((k = bch_keylist_pop(keys))) + bkey_put(c, k, 0); + } else if (op.op.insert_collision) + ret = -ESRCH; return ret; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 6ff08be..8fc1e89 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -281,8 +281,8 @@ struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); -int bch_btree_insert(struct btree_op *, struct cache_set *, - struct keylist *, atomic_t *, struct bkey *); +int bch_btree_insert(struct cache_set *, struct keylist *, + atomic_t *, struct bkey *); int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 592adf5..86de64a 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -302,10 +302,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) uint64_t start = i->j.last_seq, end = i->j.seq, n = start; struct keylist keylist; - struct btree_op op; bch_keylist_init(&keylist); - bch_btree_op_init(&op, SHRT_MAX); list_for_each_entry(i, list, list) { 
BUG_ON(i->pin && atomic_read(i->pin) != 1); @@ -322,7 +320,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) bkey_copy(keylist.top, k); bch_keylist_push(&keylist); - ret = bch_btree_insert(&op, s, &keylist, i->pin, NULL); + ret = bch_btree_insert(s, &keylist, i->pin, NULL); if (ret) goto err; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index bcce06a..6cee2ae 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -237,7 +237,7 @@ static void bch_data_insert_keys(struct closure *cl) s->flush_journal ? &s->cl : NULL); - ret = bch_btree_insert(&s->op, s->c, &s->insert_keys, + ret = bch_btree_insert(s->c, &s->insert_keys, journal_ref, replace_key); if (ret == -ESRCH) { s->insert_collision = true; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 312032e..ab0f6b4 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -139,12 +139,10 @@ static void write_dirty_finish(struct closure *cl) /* This is kind of a dumb way of signalling errors. */ if (KEY_DIRTY(&w->key)) { + int ret; unsigned i; - struct btree_op op; struct keylist keys; - int ret; - bch_btree_op_init(&op, -1); bch_keylist_init(&keys); bkey_copy(keys.top, &w->key); @@ -154,7 +152,7 @@ static void write_dirty_finish(struct closure *cl) for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - ret = bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key); + ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key); if (ret) trace_bcache_writeback_collision(&w->key); -- cgit v0.10.2 From 220bb38c21b83e2f7b842f33220bf727093eca89 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 19:02:45 -0700 Subject: bcache: Break up struct search With all the recent refactoring around struct btree op struct search has gotten rather large. 
But we can now easily break it up in a different way - we break out struct btree_insert_op which is for inserting data into the cache, and that's now what the copying gc code uses - struct search is now specific to request.c Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 88e6411..d9ccb31 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -8,7 +8,6 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "request.h" #include #include @@ -176,42 +175,25 @@ void bch_btree_verify(struct btree *b, struct bset *new) mutex_unlock(&b->c->verify_lock); } -static void data_verify_endio(struct bio *bio, int error) -{ - struct closure *cl = bio->bi_private; - closure_put(cl); -} - -void bch_data_verify(struct search *s) +void bch_data_verify(struct cached_dev *dc, struct bio *bio) { char name[BDEVNAME_SIZE]; - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - struct closure *cl = &s->cl; struct bio *check; struct bio_vec *bv; int i; - if (!s->unaligned_bvec) - bio_for_each_segment(bv, s->orig_bio, i) - bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; - - check = bio_clone(s->orig_bio, GFP_NOIO); + check = bio_clone(bio, GFP_NOIO); if (!check) return; if (bio_alloc_pages(check, GFP_NOIO)) goto out_put; - check->bi_rw = READ_SYNC; - check->bi_private = cl; - check->bi_end_io = data_verify_endio; - - closure_bio_submit(check, cl, &dc->disk); - closure_sync(cl); + submit_bio_wait(READ_SYNC, check); - bio_for_each_segment(bv, s->orig_bio, i) { - void *p1 = kmap(bv->bv_page); - void *p2 = kmap(check->bi_io_vec[i].bv_page); + bio_for_each_segment(bv, bio, i) { + void *p1 = kmap_atomic(bv->bv_page); + void *p2 = page_address(check->bi_io_vec[i].bv_page); if (memcmp(p1 + bv->bv_offset, p2 + bv->bv_offset, @@ -219,13 +201,11 @@ void bch_data_verify(struct search *s) printk(KERN_ERR "bcache (%s): verify failed at sector %llu\n", bdevname(dc->bdev, name), - (uint64_t) s->orig_bio->bi_sector); - - kunmap(bv->bv_page); - kunmap(check->bi_io_vec[i].bv_page); + (uint64_t) bio->bi_sector); + kunmap_atomic(p1); } - __bio_for_each_segment(bv, check, i, 0) + bio_for_each_segment_all(bv, check, i) __free_page(bv->bv_page); out_put: bio_put(check); diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 1c39b5a..0f4b344 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -29,12 +29,12 @@ void bch_check_keys(struct btree *, const char *, ...); #ifdef CONFIG_BCACHE_DEBUG void bch_btree_verify(struct btree *, struct bset *); -void bch_data_verify(struct search *); +void bch_data_verify(struct cached_dev *, struct bio *); #else /* DEBUG */ static inline void bch_btree_verify(struct btree *b, struct bset *i) {} -static inline void bch_data_verify(struct search *s) {}; +static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}; #endif diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 601c96a..7c1275e 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -12,8 +12,9 @@ #include struct moving_io { + struct closure cl; struct keybuf_key *w; - struct search s; + struct data_insert_op op; struct bbio bio; }; @@ -38,13 +39,13 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) static void moving_io_destructor(struct closure *cl) { - struct moving_io *io = container_of(cl, struct moving_io, s.cl); + struct moving_io *io = container_of(cl, struct moving_io, cl); kfree(io); } static void 
write_moving_finish(struct closure *cl) { - struct moving_io *io = container_of(cl, struct moving_io, s.cl); + struct moving_io *io = container_of(cl, struct moving_io, cl); struct bio *bio = &io->bio.bio; struct bio_vec *bv; int i; @@ -52,12 +53,12 @@ static void write_moving_finish(struct closure *cl) bio_for_each_segment_all(bv, bio, i) __free_page(bv->bv_page); - if (io->s.insert_collision) + if (io->op.replace_collision) trace_bcache_gc_copy_collision(&io->w->key); - bch_keybuf_del(&io->s.c->moving_gc_keys, io->w); + bch_keybuf_del(&io->op.c->moving_gc_keys, io->w); - up(&io->s.c->moving_in_flight); + up(&io->op.c->moving_in_flight); closure_return_with_destructor(cl, moving_io_destructor); } @@ -65,12 +66,12 @@ static void write_moving_finish(struct closure *cl) static void read_moving_endio(struct bio *bio, int error) { struct moving_io *io = container_of(bio->bi_private, - struct moving_io, s.cl); + struct moving_io, cl); if (error) - io->s.error = error; + io->op.error = error; - bch_bbio_endio(io->s.c, bio, error, "reading data to move"); + bch_bbio_endio(io->op.c, bio, error, "reading data to move"); } static void moving_init(struct moving_io *io) @@ -84,32 +85,30 @@ static void moving_init(struct moving_io *io) bio->bi_size = KEY_SIZE(&io->w->key) << 9; bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS); - bio->bi_private = &io->s.cl; + bio->bi_private = &io->cl; bio->bi_io_vec = bio->bi_inline_vecs; bch_bio_map(bio, NULL); } static void write_moving(struct closure *cl) { - struct search *s = container_of(cl, struct search, cl); - struct moving_io *io = container_of(s, struct moving_io, s); + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct data_insert_op *op = &io->op; - if (!s->error) { + if (!op->error) { moving_init(io); - io->bio.bio.bi_sector = KEY_START(&io->w->key); - s->op.lock = -1; - s->write_prio = 1; - s->cache_bio = &io->bio.bio; + io->bio.bio.bi_sector = KEY_START(&io->w->key); + op->write_prio = 1; + op->bio = &io->bio.bio; - s->writeback = KEY_DIRTY(&io->w->key); - s->csum = KEY_CSUM(&io->w->key); + op->writeback = KEY_DIRTY(&io->w->key); + op->csum = KEY_CSUM(&io->w->key); - bkey_copy(&s->replace_key, &io->w->key); - s->replace = true; + bkey_copy(&op->replace_key, &io->w->key); + op->replace = true; - closure_init(&s->btree, cl); - bch_data_insert(&s->btree); + closure_call(&op->cl, bch_data_insert, NULL, cl); } continue_at(cl, write_moving_finish, system_wq); @@ -117,11 +116,10 @@ static void write_moving(struct closure *cl) static void read_moving_submit(struct closure *cl) { - struct search *s = container_of(cl, struct search, cl); - struct moving_io *io = container_of(s, struct moving_io, s); + struct moving_io *io = container_of(cl, struct moving_io, cl); struct bio *bio = &io->bio.bio; - bch_submit_bbio(bio, s->c, &io->w->key, 0); + bch_submit_bbio(bio, io->op.c, &io->w->key, 0); continue_at(cl, write_moving, system_wq); } @@ -151,8 +149,8 @@ static void read_moving(struct cache_set *c) w->private = io; io->w = w; - io->s.inode = KEY_INODE(&w->key); - io->s.c = c; + io->op.inode = KEY_INODE(&w->key); + io->op.c = c; moving_init(io); bio = &io->bio.bio; @@ -166,7 +164,7 @@ static void read_moving(struct cache_set *c) trace_bcache_gc_copy(&w->key); down(&c->moving_in_flight); - closure_call(&io->s.cl, read_moving_submit, NULL, &cl); + closure_call(&io->cl, read_moving_submit, NULL, &cl); } if (0) { diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 6cee2ae..05c7c21 100644 --- 
a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -215,9 +215,9 @@ static void bio_csum(struct bio *bio, struct bkey *k) static void bch_data_insert_keys(struct closure *cl) { - struct search *s = container_of(cl, struct search, btree); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); atomic_t *journal_ref = NULL; - struct bkey *replace_key = s->replace ? &s->replace_key : NULL; + struct bkey *replace_key = op->replace ? &op->replace_key : NULL; int ret; /* @@ -232,27 +232,26 @@ static void bch_data_insert_keys(struct closure *cl) closure_sync(&s->cl); #endif - if (s->write) - journal_ref = bch_journal(s->c, &s->insert_keys, - s->flush_journal - ? &s->cl : NULL); + if (!op->replace) + journal_ref = bch_journal(op->c, &op->insert_keys, + op->flush_journal ? cl : NULL); - ret = bch_btree_insert(s->c, &s->insert_keys, + ret = bch_btree_insert(op->c, &op->insert_keys, journal_ref, replace_key); if (ret == -ESRCH) { - s->insert_collision = true; + op->replace_collision = true; } else if (ret) { - s->error = -ENOMEM; - s->insert_data_done = true; + op->error = -ENOMEM; + op->insert_data_done = true; } if (journal_ref) atomic_dec_bug(journal_ref); - if (!s->insert_data_done) + if (!op->insert_data_done) continue_at(cl, bch_data_insert_start, bcache_wq); - bch_keylist_free(&s->insert_keys); + bch_keylist_free(&op->insert_keys); closure_return(cl); } @@ -349,10 +348,10 @@ found: * * If s->writeback is true, will not fail. */ -static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, - struct search *s) +static bool bch_alloc_sectors(struct data_insert_op *op, + struct bkey *k, unsigned sectors) { - struct cache_set *c = s->c; + struct cache_set *c = op->c; struct open_bucket *b; BKEY_PADDED(key) alloc; unsigned i; @@ -367,15 +366,15 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, bkey_init(&alloc.key); spin_lock(&c->data_bucket_lock); - while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { - unsigned watermark = s->write_prio + while (!(b = pick_data_bucket(c, k, op->task, &alloc.key))) { + unsigned watermark = op->write_prio ? 
WATERMARK_MOVINGGC : WATERMARK_NONE; spin_unlock(&c->data_bucket_lock); if (bch_bucket_alloc_set(c, watermark, &alloc.key, - 1, s->writeback)) + 1, op->writeback)) return false; spin_lock(&c->data_bucket_lock); @@ -409,7 +408,7 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, */ list_move_tail(&b->list, &c->data_buckets); bkey_copy_key(&b->key, k); - b->last = s->task; + b->last = op->task; b->sectors_free -= sectors; @@ -438,8 +437,8 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, static void bch_data_invalidate(struct closure *cl) { - struct search *s = container_of(cl, struct search, btree); - struct bio *bio = s->cache_bio; + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct bio *bio = op->bio; pr_debug("invalidating %i sectors from %llu", bio_sectors(bio), (uint64_t) bio->bi_sector); @@ -447,17 +446,17 @@ static void bch_data_invalidate(struct closure *cl) while (bio_sectors(bio)) { unsigned len = min(bio_sectors(bio), 1U << 14); - if (bch_keylist_realloc(&s->insert_keys, 0, s->c)) + if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) goto out; bio->bi_sector += len; bio->bi_size -= len << 9; - bch_keylist_add(&s->insert_keys, - &KEY(s->inode, bio->bi_sector, len)); + bch_keylist_add(&op->insert_keys, + &KEY(op->inode, bio->bi_sector, len)); } - s->insert_data_done = true; + op->insert_data_done = true; bio_put(bio); out: continue_at(cl, bch_data_insert_keys, bcache_wq); @@ -465,7 +464,7 @@ out: static void bch_data_insert_error(struct closure *cl) { - struct search *s = container_of(cl, struct search, btree); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); /* * Our data write just errored, which means we've got a bunch of keys to @@ -476,9 +475,9 @@ static void bch_data_insert_error(struct closure *cl) * from the keys we'll accomplish just that. */ - struct bkey *src = s->insert_keys.keys, *dst = s->insert_keys.keys; + struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys; - while (src != s->insert_keys.top) { + while (src != op->insert_keys.top) { struct bkey *n = bkey_next(src); SET_KEY_PTRS(src, 0); @@ -488,7 +487,7 @@ static void bch_data_insert_error(struct closure *cl) src = n; } - s->insert_keys.top = dst; + op->insert_keys.top = dst; bch_data_insert_keys(cl); } @@ -496,32 +495,32 @@ static void bch_data_insert_error(struct closure *cl) static void bch_data_insert_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; - struct search *s = container_of(cl, struct search, btree); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); if (error) { /* TODO: We could try to recover from this. 
*/ - if (s->writeback) - s->error = error; - else if (s->write) + if (op->writeback) + op->error = error; + else if (!op->replace) set_closure_fn(cl, bch_data_insert_error, bcache_wq); else set_closure_fn(cl, NULL, NULL); } - bch_bbio_endio(s->c, bio, error, "writing data to cache"); + bch_bbio_endio(op->c, bio, error, "writing data to cache"); } static void bch_data_insert_start(struct closure *cl) { - struct search *s = container_of(cl, struct search, btree); - struct bio *bio = s->cache_bio, *n; + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct bio *bio = op->bio, *n; - if (s->bypass) + if (op->bypass) return bch_data_invalidate(cl); - if (atomic_sub_return(bio_sectors(bio), &s->c->sectors_to_gc) < 0) { - set_gc_sectors(s->c); - wake_up_gc(s->c); + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { + set_gc_sectors(op->c); + wake_up_gc(op->c); } /* @@ -533,21 +532,20 @@ static void bch_data_insert_start(struct closure *cl) do { unsigned i; struct bkey *k; - struct bio_set *split = s->d - ? s->d->bio_split : s->c->bio_split; + struct bio_set *split = op->c->bio_split; /* 1 for the device pointer and 1 for the chksum */ - if (bch_keylist_realloc(&s->insert_keys, - 1 + (s->csum ? 1 : 0), - s->c)) + if (bch_keylist_realloc(&op->insert_keys, + 1 + (op->csum ? 1 : 0), + op->c)) continue_at(cl, bch_data_insert_keys, bcache_wq); - k = s->insert_keys.top; + k = op->insert_keys.top; bkey_init(k); - SET_KEY_INODE(k, s->inode); + SET_KEY_INODE(k, op->inode); SET_KEY_OFFSET(k, bio->bi_sector); - if (!bch_alloc_sectors(k, bio_sectors(bio), s)) + if (!bch_alloc_sectors(op, k, bio_sectors(bio))) goto err; n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); @@ -555,30 +553,30 @@ static void bch_data_insert_start(struct closure *cl) n->bi_end_io = bch_data_insert_endio; n->bi_private = cl; - if (s->writeback) { + if (op->writeback) { SET_KEY_DIRTY(k, true); for (i = 0; i < KEY_PTRS(k); i++) - SET_GC_MARK(PTR_BUCKET(s->c, k, i), + SET_GC_MARK(PTR_BUCKET(op->c, k, i), GC_MARK_DIRTY); } - SET_KEY_CSUM(k, s->csum); + SET_KEY_CSUM(k, op->csum); if (KEY_CSUM(k)) bio_csum(n, k); trace_bcache_cache_insert(k); - bch_keylist_push(&s->insert_keys); + bch_keylist_push(&op->insert_keys); n->bi_rw |= REQ_WRITE; - bch_submit_bbio(n, s->c, k, 0); + bch_submit_bbio(n, op->c, k, 0); } while (n != bio); - s->insert_data_done = true; + op->insert_data_done = true; continue_at(cl, bch_data_insert_keys, bcache_wq); err: /* bch_alloc_sectors() blocks if s->writeback = true */ - BUG_ON(s->writeback); + BUG_ON(op->writeback); /* * But if it's not a writeback write we'd rather just bail out if @@ -586,24 +584,24 @@ err: * we might be starving btree writes for gc or something. */ - if (s->write) { + if (!op->replace) { /* * Writethrough write: We can't complete the write until we've * updated the index. But we don't want to delay the write while * we wait for buckets to be freed up, so just invalidate the * rest of the write. */ - s->bypass = true; + op->bypass = true; return bch_data_invalidate(cl); } else { /* * From a cache miss, we can just insert the keys for the data * we have written or bail out if we didn't do anything. 
*/ - s->insert_data_done = true; + op->insert_data_done = true; bio_put(bio); - if (!bch_keylist_empty(&s->insert_keys)) + if (!bch_keylist_empty(&op->insert_keys)) continue_at(cl, bch_data_insert_keys, bcache_wq); else closure_return(cl); @@ -631,15 +629,169 @@ err: */ void bch_data_insert(struct closure *cl) { - struct search *s = container_of(cl, struct search, btree); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); - bch_keylist_init(&s->insert_keys); - bio_get(s->cache_bio); + trace_bcache_write(op->bio, op->writeback, op->bypass); + + bch_keylist_init(&op->insert_keys); + bio_get(op->bio); bch_data_insert_start(cl); } +/* Congested? */ + +unsigned bch_get_congested(struct cache_set *c) +{ + int i; + long rand; + + if (!c->congested_read_threshold_us && + !c->congested_write_threshold_us) + return 0; + + i = (local_clock_us() - c->congested_last_us) / 1024; + if (i < 0) + return 0; + + i += atomic_read(&c->congested); + if (i >= 0) + return 0; + + i += CONGESTED_MAX; + + if (i > 0) + i = fract_exp_two(i, 6); + + rand = get_random_int(); + i -= bitmap_weight(&rand, BITS_PER_LONG); + + return i > 0 ? i : 1; +} + +static void add_sequential(struct task_struct *t) +{ + ewma_add(t->sequential_io_avg, + t->sequential_io, 8, 0); + + t->sequential_io = 0; +} + +static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) +{ + return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; +} + +static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) +{ + struct cache_set *c = dc->disk.c; + unsigned mode = cache_mode(dc, bio); + unsigned sectors, congested = bch_get_congested(c); + struct task_struct *task = current; + + if (atomic_read(&dc->disk.detaching) || + c->gc_stats.in_use > CUTOFF_CACHE_ADD || + (bio->bi_rw & REQ_DISCARD)) + goto skip; + + if (mode == CACHE_MODE_NONE || + (mode == CACHE_MODE_WRITEAROUND && + (bio->bi_rw & REQ_WRITE))) + goto skip; + + if (bio->bi_sector & (c->sb.block_size - 1) || + bio_sectors(bio) & (c->sb.block_size - 1)) { + pr_debug("skipping unaligned io"); + goto skip; + } + + if (!congested && !dc->sequential_cutoff) + goto rescale; + + if (!congested && + mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; + + if (dc->sequential_merge) { + struct io *i; + + spin_lock(&dc->io_lock); + + hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) + if (i->last == bio->bi_sector && + time_before(jiffies, i->jiffies)) + goto found; + + i = list_first_entry(&dc->io_lru, struct io, lru); + + add_sequential(task); + i->sequential = 0; +found: + if (i->sequential + bio->bi_size > i->sequential) + i->sequential += bio->bi_size; + + i->last = bio_end_sector(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + task->sequential_io = i->sequential; + + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); + + spin_unlock(&dc->io_lock); + } else { + task->sequential_io = bio->bi_size; + + add_sequential(task); + } + + sectors = max(task->sequential_io, + task->sequential_io_avg) >> 9; + + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(bio); + goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(bio); + goto skip; + } + +rescale: + bch_rescale_priorities(c, bio_sectors(bio)); + return false; +skip: + bch_mark_sectors_bypassed(c, dc, bio_sectors(bio)); + return true; +} + /* Cache lookup */ +struct search { + /* Stack frame for 
bio_complete */ + struct closure cl; + + struct bcache_device *d; + + struct bbio bio; + struct bio *orig_bio; + struct bio *cache_miss; + + unsigned insert_bio_sectors; + + unsigned recoverable:1; + unsigned unaligned_bvec:1; + unsigned write:1; + + unsigned long start_time; + + struct btree_op op; + struct data_insert_op iop; +}; + static void bch_cache_read_endio(struct bio *bio, int error) { struct bbio *b = container_of(bio, struct bbio, bio); @@ -654,13 +806,13 @@ static void bch_cache_read_endio(struct bio *bio, int error) */ if (error) - s->error = error; - else if (ptr_stale(s->c, &b->key, 0)) { - atomic_long_inc(&s->c->cache_read_races); - s->error = -EINTR; + s->iop.error = error; + else if (ptr_stale(s->iop.c, &b->key, 0)) { + atomic_long_inc(&s->iop.c->cache_read_races); + s->iop.error = -EINTR; } - bch_bbio_endio(s->c, bio, error, "reading from cache"); + bch_bbio_endio(s->iop.c, bio, error, "reading from cache"); } /* @@ -674,13 +826,13 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) struct bkey *bio_key; unsigned ptr; - if (bkey_cmp(k, &KEY(s->inode, bio->bi_sector, 0)) <= 0) + if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0) return MAP_CONTINUE; - if (KEY_INODE(k) != s->inode || + if (KEY_INODE(k) != s->iop.inode || KEY_START(k) > bio->bi_sector) { unsigned bio_sectors = bio_sectors(bio); - unsigned sectors = KEY_INODE(k) == s->inode + unsigned sectors = KEY_INODE(k) == s->iop.inode ? min_t(uint64_t, INT_MAX, KEY_START(k) - bio->bi_sector) : INT_MAX; @@ -708,8 +860,8 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) bio_key = &container_of(n, struct bbio, bio)->key; bch_bkey_copy_single_ptr(bio_key, k, ptr); - bch_cut_front(&KEY(s->inode, n->bi_sector, 0), bio_key); - bch_cut_back(&KEY(s->inode, bio_end_sector(n), 0), bio_key); + bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key); + bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); n->bi_end_io = bch_cache_read_endio; n->bi_private = &s->cl; @@ -731,11 +883,11 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) static void cache_lookup(struct closure *cl) { - struct search *s = container_of(cl, struct search, btree); + struct search *s = container_of(cl, struct search, iop.cl); struct bio *bio = &s->bio.bio; - int ret = bch_btree_map_keys(&s->op, s->c, - &KEY(s->inode, bio->bi_sector, 0), + int ret = bch_btree_map_keys(&s->op, s->iop.c, + &KEY(s->iop.inode, bio->bi_sector, 0), cache_lookup_fn, MAP_END_KEY); if (ret == -EAGAIN) continue_at(cl, cache_lookup, bcache_wq); @@ -751,7 +903,7 @@ static void request_endio(struct bio *bio, int error) if (error) { struct search *s = container_of(cl, struct search, cl); - s->error = error; + s->iop.error = error; /* Only cache read errors are recoverable */ s->recoverable = false; } @@ -771,8 +923,8 @@ static void bio_complete(struct search *s) part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); part_stat_unlock(); - trace_bcache_request_end(s, s->orig_bio); - bio_endio(s->orig_bio, s->error); + trace_bcache_request_end(s->d, s->orig_bio); + bio_endio(s->orig_bio, s->iop.error); s->orig_bio = NULL; } } @@ -792,8 +944,8 @@ static void search_free(struct closure *cl) struct search *s = container_of(cl, struct search, cl); bio_complete(s); - if (s->cache_bio) - bio_put(s->cache_bio); + if (s->iop.bio) + bio_put(s->iop.bio); if (s->unaligned_bvec) mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); @@ -808,18 +960,18 @@ static struct 
search *search_alloc(struct bio *bio, struct bcache_device *d) struct bio_vec *bv; s = mempool_alloc(d->c->search, GFP_NOIO); - memset(s, 0, offsetof(struct search, insert_keys)); + memset(s, 0, offsetof(struct search, iop.insert_keys)); __closure_init(&s->cl, NULL); - s->inode = d->id; - s->c = d->c; + s->iop.inode = d->id; + s->iop.c = d->c; s->d = d; s->op.lock = -1; - s->task = current; + s->iop.task = current; s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; + s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; s->recoverable = 1; s->start_time = jiffies; do_bio_hook(s); @@ -847,148 +999,20 @@ static void cached_dev_bio_complete(struct closure *cl) cached_dev_put(dc); } -unsigned bch_get_congested(struct cache_set *c) -{ - int i; - long rand; - - if (!c->congested_read_threshold_us && - !c->congested_write_threshold_us) - return 0; - - i = (local_clock_us() - c->congested_last_us) / 1024; - if (i < 0) - return 0; - - i += atomic_read(&c->congested); - if (i >= 0) - return 0; - - i += CONGESTED_MAX; - - if (i > 0) - i = fract_exp_two(i, 6); - - rand = get_random_int(); - i -= bitmap_weight(&rand, BITS_PER_LONG); - - return i > 0 ? i : 1; -} - -static void add_sequential(struct task_struct *t) -{ - ewma_add(t->sequential_io_avg, - t->sequential_io, 8, 0); - - t->sequential_io = 0; -} - -static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) -{ - return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; -} - -static bool check_should_bypass(struct cached_dev *dc, struct search *s) -{ - struct cache_set *c = s->c; - struct bio *bio = &s->bio.bio; - unsigned mode = cache_mode(dc, bio); - unsigned sectors, congested = bch_get_congested(c); - - if (atomic_read(&dc->disk.detaching) || - c->gc_stats.in_use > CUTOFF_CACHE_ADD || - (bio->bi_rw & REQ_DISCARD)) - goto skip; - - if (mode == CACHE_MODE_NONE || - (mode == CACHE_MODE_WRITEAROUND && - (bio->bi_rw & REQ_WRITE))) - goto skip; - - if (bio->bi_sector & (c->sb.block_size - 1) || - bio_sectors(bio) & (c->sb.block_size - 1)) { - pr_debug("skipping unaligned io"); - goto skip; - } - - if (!congested && !dc->sequential_cutoff) - goto rescale; - - if (!congested && - mode == CACHE_MODE_WRITEBACK && - (bio->bi_rw & REQ_WRITE) && - (bio->bi_rw & REQ_SYNC)) - goto rescale; - - if (dc->sequential_merge) { - struct io *i; - - spin_lock(&dc->io_lock); - - hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) - if (i->last == bio->bi_sector && - time_before(jiffies, i->jiffies)) - goto found; - - i = list_first_entry(&dc->io_lru, struct io, lru); - - add_sequential(s->task); - i->sequential = 0; -found: - if (i->sequential + bio->bi_size > i->sequential) - i->sequential += bio->bi_size; - - i->last = bio_end_sector(bio); - i->jiffies = jiffies + msecs_to_jiffies(5000); - s->task->sequential_io = i->sequential; - - hlist_del(&i->hash); - hlist_add_head(&i->hash, iohash(dc, i->last)); - list_move_tail(&i->lru, &dc->io_lru); - - spin_unlock(&dc->io_lock); - } else { - s->task->sequential_io = bio->bi_size; - - add_sequential(s->task); - } - - sectors = max(s->task->sequential_io, - s->task->sequential_io_avg) >> 9; - - if (dc->sequential_cutoff && - sectors >= dc->sequential_cutoff >> 9) { - trace_bcache_bypass_sequential(s->orig_bio); - goto skip; - } - - if (congested && sectors >= congested) { - trace_bcache_bypass_congested(s->orig_bio); - goto skip; - } - -rescale: - bch_rescale_priorities(c, bio_sectors(bio)); - return false; -skip: - 
bch_mark_sectors_bypassed(s, bio_sectors(bio)); - return true; -} - /* Process reads */ static void cached_dev_cache_miss_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - if (s->insert_collision) - bch_mark_cache_miss_collision(s); + if (s->iop.replace_collision) + bch_mark_cache_miss_collision(s->iop.c, s->d); - if (s->cache_bio) { + if (s->iop.bio) { int i; struct bio_vec *bv; - bio_for_each_segment_all(bv, s->cache_bio, i) + bio_for_each_segment_all(bv, s->iop.bio, i) __free_page(bv->bv_page); } @@ -1006,7 +1030,7 @@ static void cached_dev_read_error(struct closure *cl) /* Retry from the backing device: */ trace_bcache_read_retry(s->orig_bio); - s->error = 0; + s->iop.error = 0; bv = s->bio.bio.bi_io_vec; do_bio_hook(s); s->bio.bio.bi_io_vec = bv; @@ -1041,29 +1065,28 @@ static void cached_dev_read_done(struct closure *cl) * to the buffers the original bio pointed to: */ - if (s->cache_bio) { - bio_reset(s->cache_bio); - s->cache_bio->bi_sector = - s->cache_miss->bi_sector; - s->cache_bio->bi_bdev = s->cache_miss->bi_bdev; - s->cache_bio->bi_size = s->cache_bio_sectors << 9; - bch_bio_map(s->cache_bio, NULL); + if (s->iop.bio) { + bio_reset(s->iop.bio); + s->iop.bio->bi_sector = s->cache_miss->bi_sector; + s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; + s->iop.bio->bi_size = s->insert_bio_sectors << 9; + bch_bio_map(s->iop.bio, NULL); - bio_copy_data(s->cache_miss, s->cache_bio); + bio_copy_data(s->cache_miss, s->iop.bio); bio_put(s->cache_miss); s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable) - bch_data_verify(s); + if (verify(dc, &s->bio.bio) && s->recoverable && !s->unaligned_bvec) + bch_data_verify(dc, s->orig_bio); bio_complete(s); - if (s->cache_bio && - !test_bit(CACHE_SET_STOPPING, &s->c->flags)) { - BUG_ON(!s->replace); - closure_call(&s->btree, bch_data_insert, NULL, cl); + if (s->iop.bio && + !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { + BUG_ON(!s->iop.replace); + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); } continue_at(cl, cached_dev_cache_miss_done, NULL); @@ -1074,12 +1097,13 @@ static void cached_dev_read_done_bh(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - bch_mark_cache_accounting(s, !s->cache_miss, s->bypass); - trace_bcache_read(s->orig_bio, !s->cache_miss, s->bypass); + bch_mark_cache_accounting(s->iop.c, s->d, + !s->cache_miss, s->iop.bypass); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); - if (s->error) + if (s->iop.error) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); - else if (s->cache_bio || verify(dc, &s->bio.bio)) + else if (s->iop.bio || verify(dc, &s->bio.bio)) continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); @@ -1093,7 +1117,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; - if (s->cache_miss || s->bypass) { + if (s->cache_miss || s->iop.bypass) { miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); ret = miss == bio ? 
MAP_DONE : MAP_CONTINUE; goto out_submit; @@ -1101,20 +1125,21 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, if (!(bio->bi_rw & REQ_RAHEAD) && !(bio->bi_rw & REQ_META) && - s->c->gc_stats.in_use < CUTOFF_CACHE_READA) + s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); - s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada); + s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); - s->replace_key = KEY(s->inode, bio->bi_sector + - s->cache_bio_sectors, s->cache_bio_sectors); + s->iop.replace_key = KEY(s->iop.inode, + bio->bi_sector + s->insert_bio_sectors, + s->insert_bio_sectors); - ret = bch_btree_insert_check_key(b, &s->op, &s->replace_key); + ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); if (ret) return ret; - s->replace = true; + s->iop.replace = true; miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); @@ -1122,14 +1147,14 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, ret = miss == bio ? MAP_DONE : -EINTR; cache_bio = bio_alloc_bioset(GFP_NOWAIT, - DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), + DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS), dc->disk.bio_split); if (!cache_bio) goto out_submit; cache_bio->bi_sector = miss->bi_sector; cache_bio->bi_bdev = miss->bi_bdev; - cache_bio->bi_size = s->cache_bio_sectors << 9; + cache_bio->bi_size = s->insert_bio_sectors << 9; cache_bio->bi_end_io = request_endio; cache_bio->bi_private = &s->cl; @@ -1138,8 +1163,11 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; + if (reada) + bch_mark_cache_readahead(s->iop.c, s->d); + s->cache_miss = miss; - s->cache_bio = cache_bio; + s->iop.bio = cache_bio; bio_get(cache_bio); closure_bio_submit(cache_bio, &s->cl, s->d); @@ -1157,7 +1185,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; - closure_call(&s->btree, cache_lookup, NULL, cl); + closure_call(&s->iop.cl, cache_lookup, NULL, cl); continue_at(cl, cached_dev_read_done_bh, NULL); } @@ -1179,7 +1207,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); - bch_keybuf_check_overlapping(&s->c->moving_gc_keys, &start, &end); + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); down_read_non_owner(&dc->writeback_lock); if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { @@ -1187,8 +1215,8 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) * We overlap with some dirty data undergoing background * writeback, force this write to writeback */ - s->bypass = false; - s->writeback = true; + s->iop.bypass = false; + s->iop.writeback = true; } /* @@ -1199,27 +1227,25 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) * so we still want to call it. 
*/ if (bio->bi_rw & REQ_DISCARD) - s->bypass = true; + s->iop.bypass = true; if (should_writeback(dc, s->orig_bio, cache_mode(dc, bio), - s->bypass)) { - s->bypass = false; - s->writeback = true; + s->iop.bypass)) { + s->iop.bypass = false; + s->iop.writeback = true; } - trace_bcache_write(s->orig_bio, s->writeback, s->bypass); - - if (s->bypass) { - s->cache_bio = s->orig_bio; - bio_get(s->cache_bio); + if (s->iop.bypass) { + s->iop.bio = s->orig_bio; + bio_get(s->iop.bio); if (!(bio->bi_rw & REQ_DISCARD) || blk_queue_discard(bdev_get_queue(dc->bdev))) closure_bio_submit(bio, cl, s->d); - } else if (s->writeback) { + } else if (s->iop.writeback) { bch_writeback_add(dc); - s->cache_bio = bio; + s->iop.bio = bio; if (bio->bi_rw & REQ_FLUSH) { /* Also need to send a flush to the backing device */ @@ -1234,13 +1260,13 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(flush, cl, s->d); } } else { - s->cache_bio = bio_clone_bioset(bio, GFP_NOIO, - dc->disk.bio_split); + s->iop.bio = bio_clone_bioset(bio, GFP_NOIO, + dc->disk.bio_split); closure_bio_submit(bio, cl, s->d); } - closure_call(&s->btree, bch_data_insert, NULL, cl); + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); } @@ -1249,8 +1275,8 @@ static void cached_dev_nodata(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; - if (s->flush_journal) - bch_journal_meta(s->c, cl); + if (s->iop.flush_journal) + bch_journal_meta(s->iop.c, cl); /* If it's a flush, we send the flush to the backing device too */ closure_bio_submit(bio, cl, s->d); @@ -1277,7 +1303,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) if (cached_dev_get(dc)) { s = search_alloc(bio, d); - trace_bcache_request_start(s, bio); + trace_bcache_request_start(s->d, bio); if (!bio->bi_size) { /* @@ -1288,7 +1314,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) cached_dev_nodata, bcache_wq); } else { - s->bypass = check_should_bypass(dc, s); + s->iop.bypass = check_should_bypass(dc, bio); if (rw) cached_dev_write(dc, s); @@ -1378,8 +1404,8 @@ static void flash_dev_nodata(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - if (s->flush_journal) - bch_journal_meta(s->c, cl); + if (s->iop.flush_journal) + bch_journal_meta(s->iop.c, cl); continue_at(cl, search_free, NULL); } @@ -1400,7 +1426,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) cl = &s->cl; bio = &s->bio.bio; - trace_bcache_request_start(s, bio); + trace_bcache_request_start(s->d, bio); if (!bio->bi_size) { /* @@ -1411,17 +1437,17 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) flash_dev_nodata, bcache_wq); } else if (rw) { - bch_keybuf_check_overlapping(&s->c->moving_gc_keys, + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &KEY(d->id, bio->bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); - s->bypass = (bio->bi_rw & REQ_DISCARD) != 0; - s->writeback = true; - s->cache_bio = bio; + s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; + s->iop.writeback = true; + s->iop.bio = bio; - closure_call(&s->btree, bch_data_insert, NULL, cl); + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); } else { - closure_call(&s->btree, cache_lookup, NULL, cl); + closure_call(&s->iop.cl, cache_lookup, NULL, cl); } continue_at(cl, search_free, NULL); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h 
index f0e930b..54d7de2 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -3,46 +3,25 @@ #include -struct search { - /* Stack frame for bio_complete */ +struct data_insert_op { struct closure cl; - struct closure btree; - - struct bcache_device *d; struct cache_set *c; struct task_struct *task; - - struct bbio bio; - struct bio *orig_bio; - struct bio *cache_miss; - - /* Bio to be inserted into the cache */ - struct bio *cache_bio; - unsigned cache_bio_sectors; + struct bio *bio; unsigned inode; + uint16_t write_prio; + short error; - unsigned recoverable:1; - unsigned unaligned_bvec:1; - - unsigned write:1; - unsigned writeback:1; - - unsigned csum:1; unsigned bypass:1; + unsigned writeback:1; unsigned flush_journal:1; + unsigned csum:1; - unsigned insert_data_done:1; unsigned replace:1; - unsigned insert_collision:1; - - uint16_t write_prio; - - /* IO error returned to s->bio */ - short error; - unsigned long start_time; + unsigned replace_collision:1; - struct btree_op op; + unsigned insert_data_done:1; /* Anything past this point won't get zeroed in search_alloc() */ struct keylist insert_keys; diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index ea77263..84d0782 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -7,7 +7,6 @@ #include "bcache.h" #include "stats.h" #include "btree.h" -#include "request.h" #include "sysfs.h" /* @@ -196,35 +195,36 @@ static void mark_cache_stats(struct cache_stat_collector *stats, atomic_inc(&stats->cache_bypass_misses); } -void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) +void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, + bool hit, bool bypass) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct cached_dev *dc = container_of(d, struct cached_dev, disk); mark_cache_stats(&dc->accounting.collector, hit, bypass); - mark_cache_stats(&s->c->accounting.collector, hit, bypass); + mark_cache_stats(&c->accounting.collector, hit, bypass); #ifdef CONFIG_CGROUP_BCACHE mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); #endif } -void bch_mark_cache_readahead(struct search *s) +void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct cached_dev *dc = container_of(d, struct cached_dev, disk); atomic_inc(&dc->accounting.collector.cache_readaheads); - atomic_inc(&s->c->accounting.collector.cache_readaheads); + atomic_inc(&c->accounting.collector.cache_readaheads); } -void bch_mark_cache_miss_collision(struct search *s) +void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct cached_dev *dc = container_of(d, struct cached_dev, disk); atomic_inc(&dc->accounting.collector.cache_miss_collisions); - atomic_inc(&s->c->accounting.collector.cache_miss_collisions); + atomic_inc(&c->accounting.collector.cache_miss_collisions); } -void bch_mark_sectors_bypassed(struct search *s, int sectors) +void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, + int sectors) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); - atomic_add(sectors, &s->c->accounting.collector.sectors_bypassed); + atomic_add(sectors, &c->accounting.collector.sectors_bypassed); } void bch_cache_accounting_init(struct cache_accounting 
*acc, diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index c7c7a8f..adbff14 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -38,7 +38,9 @@ struct cache_accounting { struct cache_stats day; }; -struct search; +struct cache_set; +struct cached_dev; +struct bcache_device; void bch_cache_accounting_init(struct cache_accounting *acc, struct closure *parent); @@ -50,9 +52,10 @@ void bch_cache_accounting_clear(struct cache_accounting *acc); void bch_cache_accounting_destroy(struct cache_accounting *acc); -void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass); -void bch_mark_cache_readahead(struct search *s); -void bch_mark_cache_miss_collision(struct search *s); -void bch_mark_sectors_bypassed(struct search *s, int sectors); +void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *, + bool, bool); +void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *); +void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *); +void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int); #endif /* _BCACHE_STATS_H_ */ diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index f7b6c19..adbc3df 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -1,6 +1,5 @@ #include "bcache.h" #include "btree.h" -#include "request.h" #include #include diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 5ebda97..32c89b3 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -6,11 +6,9 @@ #include -struct search; - DECLARE_EVENT_CLASS(bcache_request, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio), + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio), TP_STRUCT__entry( __field(dev_t, dev ) @@ -24,8 +22,8 @@ DECLARE_EVENT_CLASS(bcache_request, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; - __entry->orig_major = s->d->disk->major; - __entry->orig_minor = s->d->disk->first_minor; + __entry->orig_major = d->disk->major; + __entry->orig_minor = d->disk->first_minor; __entry->sector = bio->bi_sector; __entry->orig_sector = bio->bi_sector - 16; __entry->nr_sector = bio->bi_size >> 9; @@ -79,13 +77,13 @@ DECLARE_EVENT_CLASS(btree_node, /* request.c */ DEFINE_EVENT(bcache_request, bcache_request_start, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio) + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) ); DEFINE_EVENT(bcache_request, bcache_request_end, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio) + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) ); DECLARE_EVENT_CLASS(bcache_bio, -- cgit v0.10.2 From 2599b53b7b0ea6103d1661dca74d35480cb8fa1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 18:11:11 -0700 Subject: bcache: Move sector allocator to alloc.c Just reorganizing things a bit. 
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index b9bd586..4970ddc 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -487,8 +487,188 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, return ret; } +/* Sector allocator */ + +struct open_bucket { + struct list_head list; + unsigned last_write_point; + unsigned sectors_free; + BKEY_PADDED(key); +}; + +/* + * We keep multiple buckets open for writes, and try to segregate different + * write streams for better cache utilization: first we look for a bucket where + * the last write to it was sequential with the current write, and failing that + * we look for a bucket that was last used by the same task. + * + * The ideas is if you've got multiple tasks pulling data into the cache at the + * same time, you'll get better cache utilization if you try to segregate their + * data and preserve locality. + * + * For example, say you've starting Firefox at the same time you're copying a + * bunch of files. Firefox will likely end up being fairly hot and stay in the + * cache awhile, but the data you copied might not be; if you wrote all that + * data to the same buckets it'd get invalidated at the same time. + * + * Both of those tasks will be doing fairly random IO so we can't rely on + * detecting sequential IO to segregate their data, but going off of the task + * should be a sane heuristic. + */ +static struct open_bucket *pick_data_bucket(struct cache_set *c, + const struct bkey *search, + unsigned write_point, + struct bkey *alloc) +{ + struct open_bucket *ret, *ret_task = NULL; + + list_for_each_entry_reverse(ret, &c->data_buckets, list) + if (!bkey_cmp(&ret->key, search)) + goto found; + else if (ret->last_write_point == write_point) + ret_task = ret; + + ret = ret_task ?: list_first_entry(&c->data_buckets, + struct open_bucket, list); +found: + if (!ret->sectors_free && KEY_PTRS(alloc)) { + ret->sectors_free = c->sb.bucket_size; + bkey_copy(&ret->key, alloc); + bkey_init(alloc); + } + + if (!ret->sectors_free) + ret = NULL; + + return ret; +} + +/* + * Allocates some space in the cache to write to, and k to point to the newly + * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the + * end of the newly allocated space). + * + * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many + * sectors were actually allocated. + * + * If s->writeback is true, will not fail. + */ +bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, + unsigned write_point, unsigned write_prio, bool wait) +{ + struct open_bucket *b; + BKEY_PADDED(key) alloc; + unsigned i; + + /* + * We might have to allocate a new bucket, which we can't do with a + * spinlock held. So if we have to allocate, we drop the lock, allocate + * and then retry. KEY_PTRS() indicates whether alloc points to + * allocated bucket(s). + */ + + bkey_init(&alloc.key); + spin_lock(&c->data_bucket_lock); + + while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { + unsigned watermark = write_prio + ? WATERMARK_MOVINGGC + : WATERMARK_NONE; + + spin_unlock(&c->data_bucket_lock); + + if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) + return false; + + spin_lock(&c->data_bucket_lock); + } + + /* + * If we had to allocate, we might race and not need to allocate the + * second time we call find_data_bucket(). 
If we allocated a bucket but + * didn't use it, drop the refcount bch_bucket_alloc_set() took: + */ + if (KEY_PTRS(&alloc.key)) + __bkey_put(c, &alloc.key); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + EBUG_ON(ptr_stale(c, &b->key, i)); + + /* Set up the pointer to the space we're allocating: */ + + for (i = 0; i < KEY_PTRS(&b->key); i++) + k->ptr[i] = b->key.ptr[i]; + + sectors = min(sectors, b->sectors_free); + + SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); + SET_KEY_SIZE(k, sectors); + SET_KEY_PTRS(k, KEY_PTRS(&b->key)); + + /* + * Move b to the end of the lru, and keep track of what this bucket was + * last used for: + */ + list_move_tail(&b->list, &c->data_buckets); + bkey_copy_key(&b->key, k); + b->last_write_point = write_point; + + b->sectors_free -= sectors; + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); + + atomic_long_add(sectors, + &PTR_CACHE(c, &b->key, i)->sectors_written); + } + + if (b->sectors_free < c->sb.block_size) + b->sectors_free = 0; + + /* + * k takes refcounts on the buckets it points to until it's inserted + * into the btree, but if we're done with this bucket we just transfer + * get_data_bucket()'s refcount. + */ + if (b->sectors_free) + for (i = 0; i < KEY_PTRS(&b->key); i++) + atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); + + spin_unlock(&c->data_bucket_lock); + return true; +} + /* Init */ +void bch_open_buckets_free(struct cache_set *c) +{ + struct open_bucket *b; + + while (!list_empty(&c->data_buckets)) { + b = list_first_entry(&c->data_buckets, + struct open_bucket, list); + list_del(&b->list); + kfree(b); + } +} + +int bch_open_buckets_alloc(struct cache_set *c) +{ + int i; + + spin_lock_init(&c->data_bucket_lock); + + for (i = 0; i < 6; i++) { + struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) + return -ENOMEM; + + list_add(&b->list, &c->data_buckets); + } + + return 0; +} + int bch_cache_allocator_start(struct cache *ca) { struct task_struct *k = kthread_run(bch_allocator_thread, diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 20fe96c..e32f6fd 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1170,6 +1170,8 @@ int __bch_bucket_alloc_set(struct cache_set *, unsigned, struct bkey *, int, bool); int bch_bucket_alloc_set(struct cache_set *, unsigned, struct bkey *, int, bool); +bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, + unsigned, unsigned, bool); __printf(2, 3) bool bch_cache_set_error(struct cache_set *, const char *, ...); @@ -1210,6 +1212,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); void bch_moving_init_cache_set(struct cache_set *); +int bch_open_buckets_alloc(struct cache_set *); +void bch_open_buckets_free(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); int bch_cache_allocator_init(struct cache *ca); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 05c7c21..cf7850a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -255,186 +255,6 @@ static void bch_data_insert_keys(struct closure *cl) closure_return(cl); } -struct open_bucket { - struct list_head list; - struct task_struct *last; - unsigned sectors_free; - BKEY_PADDED(key); -}; - -void bch_open_buckets_free(struct cache_set *c) -{ - struct open_bucket *b; - - while (!list_empty(&c->data_buckets)) { - b = list_first_entry(&c->data_buckets, - struct 
open_bucket, list); - list_del(&b->list); - kfree(b); - } -} - -int bch_open_buckets_alloc(struct cache_set *c) -{ - int i; - - spin_lock_init(&c->data_bucket_lock); - - for (i = 0; i < 6; i++) { - struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); - if (!b) - return -ENOMEM; - - list_add(&b->list, &c->data_buckets); - } - - return 0; -} - -/* - * We keep multiple buckets open for writes, and try to segregate different - * write streams for better cache utilization: first we look for a bucket where - * the last write to it was sequential with the current write, and failing that - * we look for a bucket that was last used by the same task. - * - * The ideas is if you've got multiple tasks pulling data into the cache at the - * same time, you'll get better cache utilization if you try to segregate their - * data and preserve locality. - * - * For example, say you've starting Firefox at the same time you're copying a - * bunch of files. Firefox will likely end up being fairly hot and stay in the - * cache awhile, but the data you copied might not be; if you wrote all that - * data to the same buckets it'd get invalidated at the same time. - * - * Both of those tasks will be doing fairly random IO so we can't rely on - * detecting sequential IO to segregate their data, but going off of the task - * should be a sane heuristic. - */ -static struct open_bucket *pick_data_bucket(struct cache_set *c, - const struct bkey *search, - struct task_struct *task, - struct bkey *alloc) -{ - struct open_bucket *ret, *ret_task = NULL; - - list_for_each_entry_reverse(ret, &c->data_buckets, list) - if (!bkey_cmp(&ret->key, search)) - goto found; - else if (ret->last == task) - ret_task = ret; - - ret = ret_task ?: list_first_entry(&c->data_buckets, - struct open_bucket, list); -found: - if (!ret->sectors_free && KEY_PTRS(alloc)) { - ret->sectors_free = c->sb.bucket_size; - bkey_copy(&ret->key, alloc); - bkey_init(alloc); - } - - if (!ret->sectors_free) - ret = NULL; - - return ret; -} - -/* - * Allocates some space in the cache to write to, and k to point to the newly - * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the - * end of the newly allocated space). - * - * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many - * sectors were actually allocated. - * - * If s->writeback is true, will not fail. - */ -static bool bch_alloc_sectors(struct data_insert_op *op, - struct bkey *k, unsigned sectors) -{ - struct cache_set *c = op->c; - struct open_bucket *b; - BKEY_PADDED(key) alloc; - unsigned i; - - /* - * We might have to allocate a new bucket, which we can't do with a - * spinlock held. So if we have to allocate, we drop the lock, allocate - * and then retry. KEY_PTRS() indicates whether alloc points to - * allocated bucket(s). - */ - - bkey_init(&alloc.key); - spin_lock(&c->data_bucket_lock); - - while (!(b = pick_data_bucket(c, k, op->task, &alloc.key))) { - unsigned watermark = op->write_prio - ? WATERMARK_MOVINGGC - : WATERMARK_NONE; - - spin_unlock(&c->data_bucket_lock); - - if (bch_bucket_alloc_set(c, watermark, &alloc.key, - 1, op->writeback)) - return false; - - spin_lock(&c->data_bucket_lock); - } - - /* - * If we had to allocate, we might race and not need to allocate the - * second time we call find_data_bucket(). 
If we allocated a bucket but - * didn't use it, drop the refcount bch_bucket_alloc_set() took: - */ - if (KEY_PTRS(&alloc.key)) - __bkey_put(c, &alloc.key); - - for (i = 0; i < KEY_PTRS(&b->key); i++) - EBUG_ON(ptr_stale(c, &b->key, i)); - - /* Set up the pointer to the space we're allocating: */ - - for (i = 0; i < KEY_PTRS(&b->key); i++) - k->ptr[i] = b->key.ptr[i]; - - sectors = min(sectors, b->sectors_free); - - SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); - SET_KEY_SIZE(k, sectors); - SET_KEY_PTRS(k, KEY_PTRS(&b->key)); - - /* - * Move b to the end of the lru, and keep track of what this bucket was - * last used for: - */ - list_move_tail(&b->list, &c->data_buckets); - bkey_copy_key(&b->key, k); - b->last = op->task; - - b->sectors_free -= sectors; - - for (i = 0; i < KEY_PTRS(&b->key); i++) { - SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); - - atomic_long_add(sectors, - &PTR_CACHE(c, &b->key, i)->sectors_written); - } - - if (b->sectors_free < c->sb.block_size) - b->sectors_free = 0; - - /* - * k takes refcounts on the buckets it points to until it's inserted - * into the btree, but if we're done with this bucket we just transfer - * get_data_bucket()'s refcount. - */ - if (b->sectors_free) - for (i = 0; i < KEY_PTRS(&b->key); i++) - atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); - - spin_unlock(&c->data_bucket_lock); - return true; -} - static void bch_data_invalidate(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); @@ -545,7 +365,9 @@ static void bch_data_insert_start(struct closure *cl) SET_KEY_INODE(k, op->inode); SET_KEY_OFFSET(k, bio->bi_sector); - if (!bch_alloc_sectors(op, k, bio_sectors(bio))) + if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), + op->write_point, op->write_prio, + op->writeback)) goto err; n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); @@ -968,7 +790,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) s->iop.c = d->c; s->d = d; s->op.lock = -1; - s->iop.task = current; + s->iop.write_point = hash_long((unsigned long) current, 16); s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 54d7de2..2cd65bf 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -6,10 +6,10 @@ struct data_insert_op { struct closure cl; struct cache_set *c; - struct task_struct *task; struct bio *bio; unsigned inode; + uint16_t write_point; uint16_t write_prio; short error; @@ -31,9 +31,6 @@ struct data_insert_op { unsigned bch_get_congested(struct cache_set *); void bch_data_insert(struct closure *cl); -void bch_open_buckets_free(struct cache_set *); -int bch_open_buckets_alloc(struct cache_set *); - void bch_cached_dev_request_init(struct cached_dev *dc); void bch_flash_dev_request_init(struct bcache_device *d); -- cgit v0.10.2 From 81ab4190ac17df41686a37c97f701623276b652a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2013 15:46:42 -0700 Subject: bcache: Pull on disk data structures out into a separate header Now, the on disk data structures are in a header that can be exported to userspace - and having them all centralized is nice too. 
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index e32f6fd..045cb99 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -177,6 +177,7 @@ #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ +#include #include #include #include @@ -210,168 +211,6 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); #define GC_MARK_METADATA 2 BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); -struct bkey { - uint64_t high; - uint64_t low; - uint64_t ptr[]; -}; - -/* Enough for a key with 6 pointers */ -#define BKEY_PAD 8 - -#define BKEY_PADDED(key) \ - union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } - -/* Version 0: Cache device - * Version 1: Backing device - * Version 2: Seed pointer into btree node checksum - * Version 3: Cache device with new UUID format - * Version 4: Backing device with data offset - */ -#define BCACHE_SB_VERSION_CDEV 0 -#define BCACHE_SB_VERSION_BDEV 1 -#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 -#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 -#define BCACHE_SB_MAX_VERSION 4 - -#define SB_SECTOR 8 -#define SB_SIZE 4096 -#define SB_LABEL_SIZE 32 -#define SB_JOURNAL_BUCKETS 256U -/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ -#define MAX_CACHES_PER_SET 8 - -#define BDEV_DATA_START_DEFAULT 16 /* sectors */ - -struct cache_sb { - uint64_t csum; - uint64_t offset; /* sector where this sb was written */ - uint64_t version; - - uint8_t magic[16]; - - uint8_t uuid[16]; - union { - uint8_t set_uuid[16]; - uint64_t set_magic; - }; - uint8_t label[SB_LABEL_SIZE]; - - uint64_t flags; - uint64_t seq; - uint64_t pad[8]; - - union { - struct { - /* Cache devices */ - uint64_t nbuckets; /* device size */ - - uint16_t block_size; /* sectors */ - uint16_t bucket_size; /* sectors */ - - uint16_t nr_in_set; - uint16_t nr_this_dev; - }; - struct { - /* Backing devices */ - uint64_t data_offset; - - /* - * block_size from the cache device section is still used by - * backing devices, so don't add anything here until we fix - * things to not need it for backing devices anymore - */ - }; - }; - - uint32_t last_mount; /* time_t */ - - uint16_t first_bucket; - union { - uint16_t njournal_buckets; - uint16_t keys; - }; - uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */ -}; - -BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); -BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); -BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); -#define CACHE_REPLACEMENT_LRU 0U -#define CACHE_REPLACEMENT_FIFO 1U -#define CACHE_REPLACEMENT_RANDOM 2U - -BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); -#define CACHE_MODE_WRITETHROUGH 0U -#define CACHE_MODE_WRITEBACK 1U -#define CACHE_MODE_WRITEAROUND 2U -#define CACHE_MODE_NONE 3U -BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); -#define BDEV_STATE_NONE 0U -#define BDEV_STATE_CLEAN 1U -#define BDEV_STATE_DIRTY 2U -#define BDEV_STATE_STALE 3U - -/* Version 1: Seed pointer into btree node checksum - */ -#define BCACHE_BSET_VERSION 1 - -/* - * This is the on disk format for btree nodes - a btree node on disk is a list - * of these; within each set the keys are sorted - */ -struct bset { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t keys; - - union { - struct bkey start[0]; - uint64_t d[0]; - }; -}; - -/* - * On disk format for priorities and gens - see super.c near prio_write() for - * more. 
- */ -struct prio_set { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t pad; - - uint64_t next_bucket; - - struct bucket_disk { - uint16_t prio; - uint8_t gen; - } __attribute((packed)) data[]; -}; - -struct uuid_entry { - union { - struct { - uint8_t uuid[16]; - uint8_t label[32]; - uint32_t first_reg; - uint32_t last_reg; - uint32_t invalidated; - - uint32_t flags; - /* Size of flash only volumes */ - uint64_t sectors; - }; - - uint8_t pad[128]; - }; -}; - -BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); - #include "journal.h" #include "stats.h" struct search; @@ -868,12 +707,6 @@ static inline bool key_merging_disabled(struct cache_set *c) #endif } -static inline bool SB_IS_BDEV(const struct cache_sb *sb) -{ - return sb->version == BCACHE_SB_VERSION_BDEV - || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; -} - struct bbio { unsigned submit_time_us; union { @@ -927,59 +760,6 @@ static inline unsigned local_clock_us(void) #define prio_buckets(c) \ DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) -#define JSET_MAGIC 0x245235c1a3625032ULL -#define PSET_MAGIC 0x6750e15f87337f91ULL -#define BSET_MAGIC 0x90135c78b99e07f5ULL - -#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC) -#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC) -#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC) - -/* Bkey fields: all units are in sectors */ - -#define KEY_FIELD(name, field, offset, size) \ - BITMASK(name, struct bkey, field, offset, size) - -#define PTR_FIELD(name, offset, size) \ - static inline uint64_t name(const struct bkey *k, unsigned i) \ - { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ - \ - static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ - { \ - k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ - k->ptr[i] |= v << offset; \ - } - -KEY_FIELD(KEY_PTRS, high, 60, 3) -KEY_FIELD(HEADER_SIZE, high, 58, 2) -KEY_FIELD(KEY_CSUM, high, 56, 2) -KEY_FIELD(KEY_PINNED, high, 55, 1) -KEY_FIELD(KEY_DIRTY, high, 36, 1) - -KEY_FIELD(KEY_SIZE, high, 20, 16) -KEY_FIELD(KEY_INODE, high, 0, 20) - -/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ - -static inline uint64_t KEY_OFFSET(const struct bkey *k) -{ - return k->low; -} - -static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) -{ - k->low = v; -} - -PTR_FIELD(PTR_DEV, 51, 12) -PTR_FIELD(PTR_OFFSET, 8, 43) -PTR_FIELD(PTR_GEN, 0, 8) - -#define PTR_CHECK_DEV ((1 << 12) - 1) - -#define PTR(gen, offset, dev) \ - ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen) - static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) { return s >> c->bucket_bits; @@ -1018,31 +798,11 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, /* Btree key macros */ -/* - * The high bit being set is a relic from when we used it to do binary - * searches - it told you where a key started. It's not used anymore, - * and can probably be safely dropped. 
- */ -#define KEY(dev, sector, len) \ -((struct bkey) { \ - .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ - .low = (sector) \ -}) - static inline void bkey_init(struct bkey *k) { - *k = KEY(0, 0, 0); + *k = ZERO_KEY; } -#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) -#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) - -#define MAX_KEY_INODE (~(~0 << 20)) -#define MAX_KEY_OFFSET (((uint64_t) ~0) >> 1) -#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) - -#define ZERO_KEY KEY(0, 0, 0) - /* * This is used for various on disk data structures - cache_sb, prio_set, bset, * jset: The checksum is _always_ the first 8 bytes of these structs diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f7b5525..7b8713c 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -684,7 +684,7 @@ void bch_bset_init_next(struct btree *b) } else get_random_bytes(&i->seq, sizeof(uint64_t)); - i->magic = bset_magic(b->c); + i->magic = bset_magic(&b->c->sb); i->version = 0; i->keys = 0; @@ -1034,7 +1034,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, * memcpy() */ - out->magic = bset_magic(b->c); + out->magic = bset_magic(&b->c->sb); out->seq = b->sets[0].data->seq; out->version = b->sets[0].data->version; swap(out, b->sets[0].data); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 8a93056..5cd9056 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -193,37 +193,6 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l, : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); } -static inline size_t bkey_u64s(const struct bkey *k) -{ - BUG_ON(KEY_CSUM(k) > 1); - return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0); -} - -static inline size_t bkey_bytes(const struct bkey *k) -{ - return bkey_u64s(k) * sizeof(uint64_t); -} - -static inline void bkey_copy(struct bkey *dest, const struct bkey *src) -{ - memcpy(dest, src, bkey_bytes(src)); -} - -static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) -{ - if (!src) - src = &KEY(0, 0, 0); - - SET_KEY_INODE(dest, KEY_INODE(src)); - SET_KEY_OFFSET(dest, KEY_OFFSET(src)); -} - -static inline struct bkey *bkey_next(const struct bkey *k) -{ - uint64_t *d = (void *) k; - return (struct bkey *) (d + bkey_u64s(k)); -} - /* Keylists */ struct keylist { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f5aa4ad..aba787d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -231,7 +231,7 @@ static void bch_btree_node_read_done(struct btree *b) goto err; err = "bad magic"; - if (i->magic != bset_magic(b->c)) + if (i->magic != bset_magic(&b->c->sb)) goto err; err = "bad checksum"; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 86de64a..ecdaa67 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -74,7 +74,7 @@ reread: left = ca->sb.bucket_size - offset; struct list_head *where; size_t blocks, bytes = set_bytes(j); - if (j->magic != jset_magic(ca->set)) + if (j->magic != jset_magic(&ca->sb)) return ret; if (bytes > left << 9) @@ -596,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl) for_each_cache(ca, c, i) w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; - w->data->magic = jset_magic(c); + w->data->magic = jset_magic(&c->sb); w->data->version = BCACHE_JSET_VERSION; w->data->last_seq = last_seq(&c->journal); w->data->csum = csum_set(w->data); diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 
5e9edb9..a6472fd 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -75,43 +75,6 @@ * nodes that are pinning the oldest journal entries first. */ -#define BCACHE_JSET_VERSION_UUIDv1 1 -/* Always latest UUID format */ -#define BCACHE_JSET_VERSION_UUID 1 -#define BCACHE_JSET_VERSION 1 - -/* - * On disk format for a journal entry: - * seq is monotonically increasing; every journal entry has its own unique - * sequence number. - * - * last_seq is the oldest journal entry that still has keys the btree hasn't - * flushed to disk yet. - * - * version is for on disk format changes. - */ -struct jset { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t keys; - - uint64_t last_seq; - - BKEY_PADDED(uuid_bucket); - BKEY_PADDED(btree_root); - uint16_t btree_level; - uint16_t pad[3]; - - uint64_t prio_bucket[MAX_CACHES_PER_SET]; - - union { - struct bkey start[0]; - uint64_t d[0]; - }; -}; - /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index cf7850a..932300f 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -264,16 +264,17 @@ static void bch_data_invalidate(struct closure *cl) bio_sectors(bio), (uint64_t) bio->bi_sector); while (bio_sectors(bio)) { - unsigned len = min(bio_sectors(bio), 1U << 14); + unsigned sectors = min(bio_sectors(bio), + 1U << (KEY_SIZE_BITS - 1)); if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) goto out; - bio->bi_sector += len; - bio->bi_size -= len << 9; + bio->bi_sector += sectors; + bio->bi_size -= sectors << 9; bch_keylist_add(&op->insert_keys, - &KEY(op->inode, bio->bi_sector, len)); + &KEY(op->inode, bio->bi_sector, sectors)); } op->insert_data_done = true; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a314c77..c67d19a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -45,15 +45,6 @@ const char * const bch_cache_modes[] = { NULL }; -struct uuid_entry_v0 { - uint8_t uuid[16]; - uint8_t label[32]; - uint32_t first_reg; - uint32_t last_reg; - uint32_t invalidated; - uint32_t pad; -}; - static struct kobject *bcache_kobj; struct mutex bch_register_lock; LIST_HEAD(bch_cache_sets); @@ -562,7 +553,7 @@ void bch_prio_write(struct cache *ca) } p->next_bucket = ca->prio_buckets[i + 1]; - p->magic = pset_magic(ca); + p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); @@ -613,7 +604,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) pr_warn("bad csum reading priorities"); - if (p->magic != pset_magic(ca)) + if (p->magic != pset_magic(&ca->sb)) pr_warn("bad magic reading priorities"); bucket = p->next_bucket; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ea345c6..38ae7a4 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -27,16 +27,6 @@ struct closure; #endif -#define BITMASK(name, type, field, offset, size) \ -static inline uint64_t name(const type *k) \ -{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ - \ -static inline void SET_##name(type *k, uint64_t v) \ -{ \ - k->field &= ~(~((uint64_t) ~0 << size) << offset); \ - k->field |= v << offset; \ -} - #define DECLARE_HEAP(type, name) \ struct { \ size_t size, used; \ diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h new file mode 
100644 index 0000000..164a7e2 --- /dev/null +++ b/include/uapi/linux/bcache.h @@ -0,0 +1,373 @@ +#ifndef _LINUX_BCACHE_H +#define _LINUX_BCACHE_H + +/* + * Bcache on disk data structures + */ + +#include + +#define BITMASK(name, type, field, offset, size) \ +static inline __u64 name(const type *k) \ +{ return (k->field >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + k->field &= ~(~(~0ULL << size) << offset); \ + k->field |= (v & ~(~0ULL << size)) << offset; \ +} + +/* Btree keys - all units are in sectors */ + +struct bkey { + __u64 high; + __u64 low; + __u64 ptr[]; +}; + +#define KEY_FIELD(name, field, offset, size) \ + BITMASK(name, struct bkey, field, offset, size) + +#define PTR_FIELD(name, offset, size) \ +static inline __u64 name(const struct bkey *k, unsigned i) \ +{ return (k->ptr[i] >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \ +{ \ + k->ptr[i] &= ~(~(~0ULL << size) << offset); \ + k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \ +} + +#define KEY_SIZE_BITS 16 + +KEY_FIELD(KEY_PTRS, high, 60, 3) +KEY_FIELD(HEADER_SIZE, high, 58, 2) +KEY_FIELD(KEY_CSUM, high, 56, 2) +KEY_FIELD(KEY_PINNED, high, 55, 1) +KEY_FIELD(KEY_DIRTY, high, 36, 1) + +KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS) +KEY_FIELD(KEY_INODE, high, 0, 20) + +/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ + +static inline __u64 KEY_OFFSET(const struct bkey *k) +{ + return k->low; +} + +static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v) +{ + k->low = v; +} + +/* + * The high bit being set is a relic from when we used it to do binary + * searches - it told you where a key started. It's not used anymore, + * and can probably be safely dropped. 
+ */ +#define KEY(inode, offset, size) \ +((struct bkey) { \ + .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \ + .low = (offset) \ +}) + +#define ZERO_KEY KEY(0, 0, 0) + +#define MAX_KEY_INODE (~(~0 << 20)) +#define MAX_KEY_OFFSET (~0ULL >> 1) +#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) + +#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) +#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) + +#define PTR_DEV_BITS 12 + +PTR_FIELD(PTR_DEV, 51, PTR_DEV_BITS) +PTR_FIELD(PTR_OFFSET, 8, 43) +PTR_FIELD(PTR_GEN, 0, 8) + +#define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1) + +#define PTR(gen, offset, dev) \ + ((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen) + +/* Bkey utility code */ + +static inline unsigned long bkey_u64s(const struct bkey *k) +{ + return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k); +} + +static inline unsigned long bkey_bytes(const struct bkey *k) +{ + return bkey_u64s(k) * sizeof(__u64); +} + +#define bkey_copy(_dest, _src) memcpy(_dest, _src, bkey_bytes(_src)) + +static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) +{ + SET_KEY_INODE(dest, KEY_INODE(src)); + SET_KEY_OFFSET(dest, KEY_OFFSET(src)); +} + +static inline struct bkey *bkey_next(const struct bkey *k) +{ + __u64 *d = (void *) k; + return (struct bkey *) (d + bkey_u64s(k)); +} + +static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys) +{ + __u64 *d = (void *) k; + return (struct bkey *) (d + nr_keys); +} +/* Enough for a key with 6 pointers */ +#define BKEY_PAD 8 + +#define BKEY_PADDED(key) \ + union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; } + +/* Superblock */ + +/* Version 0: Cache device + * Version 1: Backing device + * Version 2: Seed pointer into btree node checksum + * Version 3: Cache device with new UUID format + * Version 4: Backing device with data offset + */ +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_MAX_VERSION 4 + +#define SB_SECTOR 8 +#define SB_SIZE 4096 +#define SB_LABEL_SIZE 32 +#define SB_JOURNAL_BUCKETS 256U +/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ +#define MAX_CACHES_PER_SET 8 + +#define BDEV_DATA_START_DEFAULT 16 /* sectors */ + +struct cache_sb { + __u64 csum; + __u64 offset; /* sector where this sb was written */ + __u64 version; + + __u8 magic[16]; + + __u8 uuid[16]; + union { + __u8 set_uuid[16]; + __u64 set_magic; + }; + __u8 label[SB_LABEL_SIZE]; + + __u64 flags; + __u64 seq; + __u64 pad[8]; + + union { + struct { + /* Cache devices */ + __u64 nbuckets; /* device size */ + + __u16 block_size; /* sectors */ + __u16 bucket_size; /* sectors */ + + __u16 nr_in_set; + __u16 nr_this_dev; + }; + struct { + /* Backing devices */ + __u64 data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + __u32 last_mount; /* time_t */ + + __u16 first_bucket; + union { + __u16 njournal_buckets; + __u16 keys; + }; + __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + +static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) +{ + return sb->version == BCACHE_SB_VERSION_BDEV + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; +} + +BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); +BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); +BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); +#define 
CACHE_REPLACEMENT_LRU 0U +#define CACHE_REPLACEMENT_FIFO 1U +#define CACHE_REPLACEMENT_RANDOM 2U + +BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); +#define CACHE_MODE_WRITETHROUGH 0U +#define CACHE_MODE_WRITEBACK 1U +#define CACHE_MODE_WRITEAROUND 2U +#define CACHE_MODE_NONE 3U +BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); +#define BDEV_STATE_NONE 0U +#define BDEV_STATE_CLEAN 1U +#define BDEV_STATE_DIRTY 2U +#define BDEV_STATE_STALE 3U + +/* + * Magic numbers + * + * The various other data structures have their own magic numbers, which are + * xored with the first part of the cache set's UUID + */ + +#define JSET_MAGIC 0x245235c1a3625032ULL +#define PSET_MAGIC 0x6750e15f87337f91ULL +#define BSET_MAGIC 0x90135c78b99e07f5ULL + +static inline __u64 jset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ JSET_MAGIC; +} + +static inline __u64 pset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ PSET_MAGIC; +} + +static inline __u64 bset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ BSET_MAGIC; +} + +/* + * Journal + * + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique + * sequence number. + * + * last_seq is the oldest journal entry that still has keys the btree hasn't + * flushed to disk yet. + * + * version is for on disk format changes. + */ + +#define BCACHE_JSET_VERSION_UUIDv1 1 +#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ +#define BCACHE_JSET_VERSION 1 + +struct jset { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 keys; + + __u64 last_seq; + + BKEY_PADDED(uuid_bucket); + BKEY_PADDED(btree_root); + __u16 btree_level; + __u16 pad[3]; + + __u64 prio_bucket[MAX_CACHES_PER_SET]; + + union { + struct bkey start[0]; + __u64 d[0]; + }; +}; + +/* Bucket prios/gens */ + +struct prio_set { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 pad; + + __u64 next_bucket; + + struct bucket_disk { + __u16 prio; + __u8 gen; + } __attribute((packed)) data[]; +}; + +/* UUIDS - per backing device/flash only volume metadata */ + +struct uuid_entry { + union { + struct { + __u8 uuid[16]; + __u8 label[32]; + __u32 first_reg; + __u32 last_reg; + __u32 invalidated; + + __u32 flags; + /* Size of flash only volumes */ + __u64 sectors; + }; + + __u8 pad[128]; + }; +}; + +BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); + +/* Btree nodes */ + +/* Version 1: Seed pointer into btree node checksum + */ +#define BCACHE_BSET_CSUM 1 +#define BCACHE_BSET_VERSION 1 + +/* + * Btree nodes + * + * On disk a btree node is a list/log of these; within each set the keys are + * sorted + */ +struct bset { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 keys; + + union { + struct bkey start[0]; + __u64 d[0]; + }; +}; + +/* OBSOLETE */ + +/* UUIDS - per backing device/flash only volume metadata */ + +struct uuid_entry_v0 { + __u8 uuid[16]; + __u8 label[32]; + __u32 first_reg; + __u32 last_reg; + __u32 invalidated; + __u32 pad; +}; + +#endif /* _LINUX_BCACHE_H */ -- cgit v0.10.2 From e58ff155034791ed3a5563d24a50fae0a8c1617c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 18:14:44 -0700 Subject: bcache: Fix bch_ptr_bad() Previously, bch_ptr_bad() could return false when there was a pointer to a nonexistant device... it only filtered out keys with PTR_CHECK_DEV pointers. 
This behaviour was intended for multiple cache device support; for that, just because the device for one of the pointers has gone away doesn't mean we want to filter out the rest of the pointers. But we don't yet explicitly filter/check individual pointers, so without that this behaviour was wrong - a corrupt bkey with a bad device pointer could cause us to deref a bad pointer. Doh. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 7b8713c..f32216c 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -116,47 +116,46 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) bch_ptr_invalid(b, k)) return true; - if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV) - return true; + for (i = 0; i < KEY_PTRS(k); i++) { + if (!ptr_available(b->c, k, i)) + return true; - for (i = 0; i < KEY_PTRS(k); i++) - if (ptr_available(b->c, k, i)) { - g = PTR_BUCKET(b->c, k, i); - stale = ptr_stale(b->c, k, i); + g = PTR_BUCKET(b->c, k, i); + stale = ptr_stale(b->c, k, i); - btree_bug_on(stale > 96, b, - "key too stale: %i, need_gc %u", - stale, b->c->need_gc); + btree_bug_on(stale > 96, b, + "key too stale: %i, need_gc %u", + stale, b->c->need_gc); - btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), - b, "stale dirty pointer"); + btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), + b, "stale dirty pointer"); - if (stale) - return true; + if (stale) + return true; #ifdef CONFIG_BCACHE_EDEBUG - if (!mutex_trylock(&b->c->bucket_lock)) - continue; - - if (b->level) { - if (KEY_DIRTY(k) || - g->prio != BTREE_PRIO || - (b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_METADATA)) - goto bug; - - } else { - if (g->prio == BTREE_PRIO) - goto bug; - - if (KEY_DIRTY(k) && - b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_DIRTY) - goto bug; - } - mutex_unlock(&b->c->bucket_lock); -#endif + if (!mutex_trylock(&b->c->bucket_lock)) + continue; + + if (b->level) { + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto bug; + + } else { + if (g->prio == BTREE_PRIO) + goto bug; + + if (KEY_DIRTY(k) && + b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_DIRTY) + goto bug; } + mutex_unlock(&b->c->bucket_lock); +#endif + } return false; #ifdef CONFIG_BCACHE_EDEBUG -- cgit v0.10.2 From 280481d06c8a683d9aaa26125476222e76b733c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2013 16:36:03 -0700 Subject: bcache: Debug code improvements Couple changes: * Consolidate bch_check_keys() and bch_check_key_order(), and move the checks that only check_key_order() could do to bch_btree_iter_next(). * Get rid of CONFIG_BCACHE_EDEBUG - now, all that code is compiled in when CONFIG_BCACHE_DEBUG is enabled, and there's now a sysfs file to flip on the EDEBUG checks at runtime. * Dropped an old not terribly useful check in rw_unlock(), and refactored/improved a some of the other debug code. 
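The heart of the EDEBUG change is a pattern worth seeing in isolation: the expensive checks stay compiled in (under the one remaining debug option) but only execute when a runtime flag is set. The sketch below is plain userspace C with made-up names (the toy cache_set, check_keys_expensive and check_keys are illustrative stand-ins, not the bcache API); it only shows the compile-in-but-gate-at-runtime shape the patch moves to.

#include <stdbool.h>
#include <stdio.h>

struct cache_set {
        unsigned expensive_debug_checks:1;      /* flipped via a sysfs file in the real code */
};

static bool expensive_debug_checks(const struct cache_set *c)
{
        return c->expensive_debug_checks;
}

static void check_keys_expensive(struct cache_set *c)
{
        (void) c;
        /* stands in for the costly consistency walk over a btree node */
        printf("running expensive key checks\n");
}

/* Always compiled; the cost is only paid when the flag is on. */
#define check_keys(c)                                   \
do {                                                    \
        if (expensive_debug_checks(c))                  \
                check_keys_expensive(c);                \
} while (0)

int main(void)
{
        struct cache_set c = { .expensive_debug_checks = 0 };

        check_keys(&c);                         /* flag off: no-op */
        c.expensive_debug_checks = 1;           /* what writing 1 to the sysfs file does */
        check_keys(&c);                         /* flag on: checks run */
        return 0;
}

The upshot is that one binary can run with or without the heavy checks, which is why a separate Kconfig symbol for them is no longer needed.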
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index f950c9d..2638417 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -13,15 +13,8 @@ config BCACHE_DEBUG ---help--- Don't select this option unless you're a developer - Enables extra debugging tools (primarily a fuzz tester) - -config BCACHE_EDEBUG - bool "Extended runtime checks" - depends on BCACHE - ---help--- - Don't select this option unless you're a developer - - Enables extra runtime checks which significantly affect performance + Enables extra debugging tools, allows expensive runtime checks to be + turned on. config BCACHE_CLOSURES_DEBUG bool "Debug closures" diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 4970ddc..ed5920b 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -398,8 +398,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) out: wake_up_process(ca->alloc_thread); -#ifdef CONFIG_BCACHE_EDEBUG - { + if (expensive_debug_checks(ca->set)) { size_t iter; long i; @@ -413,7 +412,7 @@ out: fifo_for_each(i, &ca->unused, iter) BUG_ON(i == r); } -#endif + b = ca->buckets + r; BUG_ON(atomic_read(&b->pin) != 1); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 045cb99..d03bc6f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -690,6 +690,7 @@ struct cache_set { unsigned short journal_delay_ms; unsigned verify:1; unsigned key_merging_disabled:1; + unsigned expensive_debug_checks:1; unsigned gc_always_rewrite:1; unsigned shrinker_disabled:1; unsigned copy_gc_enabled:1; @@ -698,15 +699,6 @@ struct cache_set { struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; }; -static inline bool key_merging_disabled(struct cache_set *c) -{ -#ifdef CONFIG_BCACHE_DEBUG - return c->key_merging_disabled; -#else - return 0; -#endif -} - struct bbio { unsigned submit_time_us; union { diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f32216c..6bffde4 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -106,6 +106,43 @@ bad: return true; } +static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k, + unsigned ptr) +{ + struct bucket *g = PTR_BUCKET(b->c, k, ptr); + char buf[80]; + + if (mutex_trylock(&b->c->bucket_lock)) { + if (b->level) { + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto err; + + } else { + if (g->prio == BTREE_PRIO) + goto err; + + if (KEY_DIRTY(k) && + b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_DIRTY) + goto err; + } + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_bkey_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", + buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + return true; +} + bool bch_ptr_bad(struct btree *b, const struct bkey *k) { struct bucket *g; @@ -133,46 +170,12 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) if (stale) return true; -#ifdef CONFIG_BCACHE_EDEBUG - if (!mutex_trylock(&b->c->bucket_lock)) - continue; - - if (b->level) { - if (KEY_DIRTY(k) || - g->prio != BTREE_PRIO || - (b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_METADATA)) - goto bug; - - } else { - if (g->prio == BTREE_PRIO) - goto bug; - - if (KEY_DIRTY(k) && - b->c->gc_mark_valid && - GC_MARK(g) != 
GC_MARK_DIRTY) - goto bug; - } - mutex_unlock(&b->c->bucket_lock); -#endif + if (expensive_debug_checks(b->c) && + ptr_bad_expensive_checks(b, k, i)) + return true; } return false; -#ifdef CONFIG_BCACHE_EDEBUG -bug: - mutex_unlock(&b->c->bucket_lock); - - { - char buf[80]; - - bch_bkey_to_text(buf, sizeof(buf), k); - btree_bug(b, -"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", - buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); - } - return true; -#endif } /* Key/pointer manipulation */ @@ -821,16 +824,16 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, } else i = bset_search_write_set(b, t, search); -#ifdef CONFIG_BCACHE_EDEBUG - BUG_ON(bset_written(b, t) && - i.l != t->data->start && - bkey_cmp(tree_to_prev_bkey(t, - inorder_to_tree(bkey_to_cacheline(t, i.l), t)), - search) > 0); + if (expensive_debug_checks(b->c)) { + BUG_ON(bset_written(b, t) && + i.l != t->data->start && + bkey_cmp(tree_to_prev_bkey(t, + inorder_to_tree(bkey_to_cacheline(t, i.l), t)), + search) > 0); - BUG_ON(i.r != end(t->data) && - bkey_cmp(i.r, search) <= 0); -#endif + BUG_ON(i.r != end(t->data) && + bkey_cmp(i.r, search) <= 0); + } while (likely(i.l != i.r) && bkey_cmp(i.l, search) <= 0) @@ -871,12 +874,16 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, } struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, - struct bkey *search, struct bset_tree *start) + struct bkey *search, struct bset_tree *start) { struct bkey *ret = NULL; iter->size = ARRAY_SIZE(iter->data); iter->used = 0; +#ifdef CONFIG_BCACHE_DEBUG + iter->b = b; +#endif + for (; start <= &b->sets[b->nsets]; start++) { ret = bch_bset_search(b, start, search); bch_btree_iter_push(iter, ret, end(start->data)); @@ -891,6 +898,8 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) struct bkey *ret = NULL; if (!btree_iter_end(iter)) { + bch_btree_iter_next_check(iter); + ret = iter->data->k; iter->data->k = bkey_next(iter->data->k); @@ -1002,7 +1011,6 @@ static void btree_mergesort(struct btree *b, struct bset *out, out->keys = last ? 
(uint64_t *) bkey_next(last) - out->d : 0; pr_debug("sorted %i keys", out->keys); - bch_check_key_order(b, out); } static void __btree_sort(struct btree *b, struct btree_iter *iter, @@ -1063,15 +1071,15 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, void bch_btree_sort_partial(struct btree *b, unsigned start) { - size_t oldsize = 0, order = b->page_order, keys = 0; + size_t order = b->page_order, keys = 0; struct btree_iter iter; + int oldsize = bch_count_data(b); + __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); BUG_ON(b->sets[b->nsets].data == write_block(b) && (b->sets[b->nsets].size || b->nsets)); - if (b->written) - oldsize = bch_count_data(b); if (start) { unsigned i; @@ -1087,7 +1095,7 @@ void bch_btree_sort_partial(struct btree *b, unsigned start) __btree_sort(b, &iter, start, order, false); - EBUG_ON(b->written && bch_count_data(b) != oldsize); + EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); } void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 5cd9056..a043a92 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -148,6 +148,9 @@ struct btree_iter { size_t size, used; +#ifdef CONFIG_BCACHE_DEBUG + struct btree *b; +#endif struct btree_iter_set { struct bkey *k, *end; } data[MAX_BSETS]; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index aba787d..fa4d0b1 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -216,6 +216,10 @@ static void bch_btree_node_read_done(struct btree *b) iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; +#ifdef CONFIG_BCACHE_DEBUG + iter->b = b; +#endif + if (!i->seq) goto err; @@ -454,7 +458,7 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) BUG_ON(b->written >= btree_blocks(b)); BUG_ON(b->written && !i->keys); BUG_ON(b->sets->data->seq != i->seq); - bch_check_key_order(b, i); + bch_check_keys(b, "writing"); cancel_delayed_work(&b->work); @@ -1917,7 +1921,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, struct bkey *replace_key) { bool ret = false; - unsigned oldsize = bch_count_data(b); + int oldsize = bch_count_data(b); while (!bch_keylist_empty(insert_keys)) { struct bset *i = write_block(b); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 8fc1e89..27e90b1 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -259,14 +259,6 @@ static inline void rw_lock(bool w, struct btree *b, int level) static inline void rw_unlock(bool w, struct btree *b) { -#ifdef CONFIG_BCACHE_EDEBUG - unsigned i; - - if (w && b->key.ptr[0]) - for (i = 0; i <= b->nsets; i++) - bch_check_key_order(b, b->sets[i].data); -#endif - if (w) b->seq++; (w ? up_write : up_read)(&b->lock); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index d9ccb31..e99e6b8 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -76,29 +76,17 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) return out - buf; } -int bch_btree_to_text(char *buf, size_t size, const struct btree *b) -{ - return scnprintf(buf, size, "%zu level %i/%i", - PTR_BUCKET_NR(b->c, &b->key, 0), - b->level, b->c->root ? b->c->root->level : -1); -} - -#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) - -static bool skipped_backwards(struct btree *b, struct bkey *k) -{ - return bkey_cmp(k, (!b->level) - ? 
&START_KEY(bkey_next(k)) - : bkey_next(k)) > 0; -} +#ifdef CONFIG_BCACHE_DEBUG static void dump_bset(struct btree *b, struct bset *i) { - struct bkey *k; + struct bkey *k, *next; unsigned j; char buf[80]; - for (k = i->start; k < end(i); k = bkey_next(k)) { + for (k = i->start; k < end(i); k = next) { + next = bkey_next(k); + bch_bkey_to_text(buf, sizeof(buf), k); printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), (uint64_t *) k - i->d, i->keys, buf); @@ -114,15 +102,21 @@ static void dump_bset(struct btree *b, struct bset *i) printk(" %s\n", bch_ptr_status(b->c, k)); - if (bkey_next(k) < end(i) && - skipped_backwards(b, k)) + if (next < end(i) && + bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0) printk(KERN_ERR "Key skipped backwards\n"); } } -#endif +static void bch_dump_bucket(struct btree *b) +{ + unsigned i; -#ifdef CONFIG_BCACHE_DEBUG + console_lock(); + for (i = 0; i <= b->nsets; i++) + dump_bset(b, b->sets[i].data); + console_unlock(); +} void bch_btree_verify(struct btree *b, struct bset *new) { @@ -211,11 +205,7 @@ out_put: bio_put(check); } -#endif - -#ifdef CONFIG_BCACHE_EDEBUG - -unsigned bch_count_data(struct btree *b) +int __bch_count_data(struct btree *b) { unsigned ret = 0; struct btree_iter iter; @@ -227,72 +217,60 @@ unsigned bch_count_data(struct btree *b) return ret; } -static void vdump_bucket_and_panic(struct btree *b, const char *fmt, - va_list args) -{ - unsigned i; - char buf[80]; - - console_lock(); - - for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data); - - vprintk(fmt, args); - - console_unlock(); - - bch_btree_to_text(buf, sizeof(buf), b); - panic("at %s\n", buf); -} - -void bch_check_key_order_msg(struct btree *b, struct bset *i, - const char *fmt, ...) -{ - struct bkey *k; - - if (!i->keys) - return; - - for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) - if (skipped_backwards(b, k)) { - va_list args; - va_start(args, fmt); - - vdump_bucket_and_panic(b, fmt, args); - va_end(args); - } -} - -void bch_check_keys(struct btree *b, const char *fmt, ...) +void __bch_check_keys(struct btree *b, const char *fmt, ...) { va_list args; struct bkey *k, *p = NULL; struct btree_iter iter; - - if (b->level) - return; + const char *err; for_each_key(b, k, &iter) { - if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { - printk(KERN_ERR "Keys out of order:\n"); - goto bug; - } - - if (bch_ptr_invalid(b, k)) - continue; - - if (p && bkey_cmp(p, &START_KEY(k)) > 0) { - printk(KERN_ERR "Overlapping keys:\n"); - goto bug; + if (!b->level) { + err = "Keys out of order"; + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) + goto bug; + + if (bch_ptr_invalid(b, k)) + continue; + + err = "Overlapping keys"; + if (p && bkey_cmp(p, &START_KEY(k)) > 0) + goto bug; + } else { + if (bch_ptr_bad(b, k)) + continue; + + err = "Duplicate keys"; + if (p && !bkey_cmp(p, k)) + goto bug; } p = k; } + + err = "Key larger than btree node key"; + if (p && bkey_cmp(p, &b->key) > 0) + goto bug; + return; bug: + bch_dump_bucket(b); + va_start(args, fmt); - vdump_bucket_and_panic(b, fmt, args); + vprintk(fmt, args); va_end(args); + + panic("bcache error: %s:\n", err); +} + +void bch_btree_iter_next_check(struct btree_iter *iter) +{ + struct bkey *k = iter->data->k, *next = bkey_next(k); + + if (next < iter->data->end && + bkey_cmp(k, iter->b->level ? 
next : &START_KEY(next)) > 0) { + bch_dump_bucket(iter->b); + panic("Key skipped backwards\n"); + } } #endif diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 0f4b344..7914ba0 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -4,40 +4,42 @@ /* Btree/bkey debug printing */ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); -int bch_btree_to_text(char *buf, size_t size, const struct btree *b); - -#ifdef CONFIG_BCACHE_EDEBUG - -unsigned bch_count_data(struct btree *); -void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...); -void bch_check_keys(struct btree *, const char *, ...); - -#define bch_check_key_order(b, i) \ - bch_check_key_order_msg(b, i, "keys out of order") -#define EBUG_ON(cond) BUG_ON(cond) - -#else /* EDEBUG */ - -#define bch_count_data(b) 0 -#define bch_check_key_order(b, i) do {} while (0) -#define bch_check_key_order_msg(b, i, ...) do {} while (0) -#define bch_check_keys(b, ...) do {} while (0) -#define EBUG_ON(cond) do {} while (0) - -#endif #ifdef CONFIG_BCACHE_DEBUG void bch_btree_verify(struct btree *, struct bset *); void bch_data_verify(struct cached_dev *, struct bio *); +int __bch_count_data(struct btree *); +void __bch_check_keys(struct btree *, const char *, ...); +void bch_btree_iter_next_check(struct btree_iter *); + +#define EBUG_ON(cond) BUG_ON(cond) +#define expensive_debug_checks(c) ((c)->expensive_debug_checks) +#define key_merging_disabled(c) ((c)->key_merging_disabled) #else /* DEBUG */ static inline void bch_btree_verify(struct btree *b, struct bset *i) {} -static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}; +static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} +static inline int __bch_count_data(struct btree *b) { return -1; } +static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} +static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} + +#define EBUG_ON(cond) do { if (cond); } while (0) +#define expensive_debug_checks(c) 0 +#define key_merging_disabled(c) 0 #endif +#define bch_count_data(b) \ + (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) + +#define bch_check_keys(b, ...) 
\ +do { \ + if (expensive_debug_checks((b)->c)) \ + __bch_check_keys(b, __VA_ARGS__); \ +} while (0) + #ifdef CONFIG_DEBUG_FS void bch_debug_init_cache_set(struct cache_set *); #else diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index ab286b9..9687771 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -102,6 +102,7 @@ rw_attribute(io_error_halflife); rw_attribute(verify); rw_attribute(key_merging_disabled); rw_attribute(gc_always_rewrite); +rw_attribute(expensive_debug_checks); rw_attribute(freelist_percent); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); @@ -517,6 +518,8 @@ lock_root: sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); sysfs_printf(verify, "%i", c->verify); sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); + sysfs_printf(expensive_debug_checks, + "%i", c->expensive_debug_checks); sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); @@ -599,6 +602,7 @@ STORE(__bch_cache_set) sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); sysfs_strtoul(verify, c->verify); sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); + sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); @@ -674,6 +678,7 @@ static struct attribute *bch_cache_set_internal_files[] = { #ifdef CONFIG_BCACHE_DEBUG &sysfs_verify, &sysfs_key_merging_disabled, + &sysfs_expensive_debug_checks, #endif &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 38ae7a4..8ce5aab 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -15,12 +15,12 @@ struct closure; -#ifdef CONFIG_BCACHE_EDEBUG +#ifdef CONFIG_BCACHE_DEBUG #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) -#else /* EDEBUG */ +#else /* DEBUG */ #define atomic_dec_bug(v) atomic_dec(v) #define atomic_inc_bug(v, i) atomic_inc(v) -- cgit v0.10.2 From 3a3b6a4e075188342b58d4b6560f5540af64cac0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 16:46:42 -0700 Subject: bcache: Don't bother with bucket refcount for btree node allocations The bucket refcount (dropped with bkey_put()) is only needed to prevent the newly allocated bucket from being garbage collected until we've added a pointer to it somewhere. But for btree node allocations, the fact that we have btree nodes locked is enough to guard against races with garbage collection. Eventually the per bucket refcount is going to be replaced with something specific to bch_alloc_sectors(). 
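The refcount being discussed is easiest to picture as a pin that keeps garbage collection away from a bucket in the window between "allocated" and "pointer published in the btree". Below is a toy, self-contained C sketch of that idea using invented names (bucket_pin, bucket_unpin, gc_may_reclaim); it is not the bcache allocator. The patch's observation is that btree node allocations never need the pin, because the btree node locks already keep GC out until the new pointer is in place.

#include <stdatomic.h>
#include <stdbool.h>

struct bucket {
        atomic_int pin;                 /* > 0: GC must leave this bucket alone */
};

static void bucket_pin(struct bucket *b)        /* taken at allocation time */
{
        atomic_fetch_add(&b->pin, 1);
}

static void bucket_unpin(struct bucket *b)      /* analogous to bkey_put() */
{
        atomic_fetch_sub(&b->pin, 1);
}

static bool gc_may_reclaim(struct bucket *b)
{
        /*
         * GC skips pinned buckets, so a bucket that has been handed out but
         * not yet linked into the btree cannot be reused under the caller.
         */
        return atomic_load(&b->pin) == 0;
}

int main(void)
{
        struct bucket b = {0};

        bucket_pin(&b);         /* allocation */
        /* ... caller inserts a key pointing at b into the btree ... */
        bucket_unpin(&b);       /* pointer published: drop the pin */
        return gc_may_reclaim(&b) ? 0 : 1;
}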
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index ed5920b..2b46bf1 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -472,7 +472,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, return 0; err: bch_bucket_free(c, k); - __bkey_put(c, k); + bkey_put(c, k); return -1; } @@ -588,7 +588,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, * didn't use it, drop the refcount bch_bucket_alloc_set() took: */ if (KEY_PTRS(&alloc.key)) - __bkey_put(c, &alloc.key); + bkey_put(c, &alloc.key); for (i = 0; i < KEY_PTRS(&b->key); i++) EBUG_ON(ptr_stale(c, &b->key, i)); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fa4d0b1..7dff73b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -180,7 +180,7 @@ static inline bool should_split(struct btree *b) /* Btree key manipulation */ -void __bkey_put(struct cache_set *c, struct bkey *k) +void bkey_put(struct cache_set *c, struct bkey *k) { unsigned i; @@ -189,12 +189,6 @@ void __bkey_put(struct cache_set *c, struct bkey *k) atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); } -static void bkey_put(struct cache_set *c, struct bkey *k, int level) -{ - if ((level && KEY_OFFSET(k)) || !level) - __bkey_put(c, k); -} - /* Btree IO */ static uint64_t btree_csum_set(struct btree *b, struct bset *i) @@ -1068,6 +1062,7 @@ retry: if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) goto err; + bkey_put(c, &k.key); SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); b = mca_alloc(c, &k.key, level); @@ -1077,7 +1072,6 @@ retry: if (!b) { cache_bug(c, "Tried to allocate bucket that was in btree cache"); - __bkey_put(c, &k.key); goto retry; } @@ -1090,7 +1084,6 @@ retry: return b; err_free: bch_bucket_free(c, &k.key); - __bkey_put(c, &k.key); err: mutex_unlock(&c->bucket_lock); @@ -1217,7 +1210,6 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k) if (!IS_ERR_OR_NULL(n)) { swap(b, n); - __bkey_put(b->c, &b->key); memcpy(k->ptr, b->key.ptr, sizeof(uint64_t) * KEY_PTRS(&b->key)); @@ -1932,19 +1924,12 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, break; if (bkey_cmp(k, &b->key) <= 0) { - bkey_put(b->c, k, b->level); + if (!b->level) + bkey_put(b->c, k); ret |= btree_insert_key(b, op, k, replace_key); bch_keylist_pop_front(insert_keys); } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { -#if 0 - if (replace_key) { - bkey_put(b->c, k, b->level); - bch_keylist_pop_front(insert_keys); - op->insert_collision = true; - break; - } -#endif BKEY_PADDED(key) temp; bkey_copy(&temp.key, insert_keys->keys); @@ -2071,11 +2056,9 @@ static int btree_split(struct btree *b, struct btree_op *op, return 0; err_free2: - __bkey_put(n2->c, &n2->key); btree_node_free(n2); rw_unlock(true, n2); err_free1: - __bkey_put(n1->c, &n1->key); btree_node_free(n1); rw_unlock(true, n1); err: @@ -2225,7 +2208,7 @@ int bch_btree_insert(struct cache_set *c, struct keylist *keys, pr_err("error %i", ret); while ((k = bch_keylist_pop(keys))) - bkey_put(c, k, 0); + bkey_put(c, k); } else if (op.op.insert_collision) ret = -ESRCH; @@ -2251,7 +2234,6 @@ void bch_btree_set_root(struct btree *b) mutex_unlock(&b->c->bucket_lock); b->c->root = b; - __bkey_put(b->c, &b->key); bch_journal_meta(b->c, &cl); closure_sync(&cl); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 27e90b1..d4b705e 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -216,7 +216,7 
@@ static inline struct bkey *bch_btree_iter_init(struct btree *b, return __bch_btree_iter_init(b, iter, search, b->sets); } -void __bkey_put(struct cache_set *c, struct bkey *k); +void bkey_put(struct cache_set *c, struct bkey *k); /* Looping macros */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c67d19a..05f8ccb 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -426,7 +426,7 @@ static int __uuid_write(struct cache_set *c) closure_sync(&cl); bkey_copy(&c->uuid_bucket, &k.key); - __bkey_put(c, &k.key); + bkey_put(c, &k.key); return 0; } -- cgit v0.10.2 From d5cc66e95744065f96024add4bf7d7e019be54ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 23:06:40 -0700 Subject: bcache: bch_(btree|extent)_ptr_invalid() Trying to treat btree pointers and leaf node pointers the same way was a mistake - going to start being more explicit about the type of key/pointer we're dealing with. This is the first part of that refactoring; this patch shouldn't change any actual behaviour. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 6bffde4..b615348 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -73,19 +73,9 @@ void bch_keylist_pop_front(struct keylist *l) /* Pointer validation */ -bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) +static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) { unsigned i; - char buf[80]; - - if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) - goto bad; - - if (!level && KEY_SIZE(k) > KEY_OFFSET(k)) - goto bad; - - if (!KEY_SIZE(k)) - return true; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { @@ -96,13 +86,46 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) if (KEY_SIZE(k) + r > c->sb.bucket_size || bucket < ca->sb.first_bucket || bucket >= ca->sb.nbuckets) - goto bad; + return true; } return false; +} + +bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + + return false; +bad: + bch_bkey_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); + return true; +} + +bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_SIZE(k)) + return true; + + if (KEY_SIZE(k) > KEY_OFFSET(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + + return false; bad: bch_bkey_to_text(buf, sizeof(buf), k); - cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); + cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); return true; } diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index a043a92..e673860 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -277,7 +277,9 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) } const char *bch_ptr_status(struct cache_set *, const struct bkey *); -bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); +bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); +bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); + bool bch_ptr_bad(struct btree *, const struct bkey *); static inline uint8_t gen_after(uint8_t a, uint8_t b) diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d4b705e..e11bb85 100644 --- 
a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -204,11 +204,6 @@ static inline void set_gc_sectors(struct cache_set *c) atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); } -static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) -{ - return __bch_ptr_invalid(b->c, b->level, k); -} - static inline struct bkey *bch_btree_iter_init(struct btree *b, struct btree_iter *iter, struct bkey *search) @@ -216,6 +211,14 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b, return __bch_btree_iter_init(b, iter, search, b->sets); } +static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) +{ + if (b->level) + return bch_btree_ptr_invalid(b->c, k); + else + return bch_extent_ptr_invalid(b->c, k); +} + void bkey_put(struct cache_set *c, struct bkey *k); /* Looping macros */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 05f8ccb..7ab926d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -373,7 +373,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) { struct bkey *k = &j->uuid_bucket; - if (__bch_ptr_invalid(c, 1, k)) + if (bch_btree_ptr_invalid(c, k)) return "bad uuid pointer"; bkey_copy(&c->uuid_bucket, k); @@ -1522,7 +1522,7 @@ static void run_cache_set(struct cache_set *c) k = &j->btree_root; err = "bad btree root"; - if (__bch_ptr_invalid(c, j->btree_level + 1, k)) + if (bch_btree_ptr_invalid(c, k)) goto err; err = "error reading btree root"; -- cgit v0.10.2 From 0eacac22034ca21c73fe49e800d0b938b2047250 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Jul 2013 19:29:05 -0700 Subject: bcache: PRECEDING_KEY() btree_insert_key() was open coding this, this is just refactoring. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index e673860..bc99755 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -353,12 +353,30 @@ void bch_bset_fix_lookup_table(struct btree *, struct bkey *); struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, const struct bkey *); +/* + * Returns the first key that is strictly greater than search + */ static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, const struct bkey *search) { return search ? __bch_bset_search(b, t, search) : t->data->start; } +#define PRECEDING_KEY(_k) \ +({ \ + struct bkey *_ret = NULL; \ + \ + if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ + _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ + \ + if (!_ret->low) \ + _ret->high--; \ + _ret->low--; \ + } \ + \ + _ret; \ +}) + bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); void bch_btree_sort_lazy(struct btree *); void bch_btree_sort_into(struct btree *, struct btree *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7dff73b..773b0e9 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1844,19 +1844,14 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, if (!b->level) { struct btree_iter iter; - struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); /* * bset_search() returns the first key that is strictly greater * than the search key - but for back merging, we want to find - * the first key that is greater than or equal to KEY_START(k) - - * unless KEY_START(k) is 0. + * the previous key. 
*/ - if (KEY_OFFSET(&search)) - SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); - prev = NULL; - m = bch_btree_iter_init(b, &iter, &search); + m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k))); if (fix_overlapping_extents(b, k, &iter, replace_key)) { op->insert_collision = true; -- cgit v0.10.2 From f269af5a078302712de8ee70d273eba2eb4485ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Jul 2013 20:48:29 -0700 Subject: bcache: Add btree_node_write_sync() More refactoring - mostly making the interfaces more explicit about what we actually want to do. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 773b0e9..7d6204c 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -474,6 +474,15 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) bch_bset_init_next(b); } +static void bch_btree_node_write_sync(struct btree *b) +{ + struct closure cl; + + closure_init_stack(&cl); + bch_btree_node_write(b, &cl); + closure_sync(&cl); +} + static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); @@ -655,10 +664,8 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) return -ENOMEM; } - if (btree_node_dirty(b)) { - bch_btree_node_write(b, &cl); - closure_sync(&cl); - } + if (btree_node_dirty(b)) + bch_btree_node_write_sync(b); /* wait for any in flight btree write */ closure_wait_event(&b->io.wait, &cl, @@ -1411,9 +1418,6 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, struct btree *n = NULL; unsigned keys = 0; int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); - struct closure cl; - - closure_init_stack(&cl); if (b->level || stale > 10) n = btree_node_alloc_replacement(b); @@ -1424,12 +1428,10 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, if (b->level) ret = btree_gc_recurse(b, op, writes, gc); - if (!b->written || btree_node_dirty(b)) { - bch_btree_node_write(b, n ? &cl : NULL); - } + if (!b->written || btree_node_dirty(b)) + bch_btree_node_write_sync(b); if (!IS_ERR_OR_NULL(n)) { - closure_sync(&cl); bch_btree_set_root(b); btree_node_free(n); rw_unlock(true, b); @@ -2104,15 +2106,10 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { - if (!b->level) { + if (!b->level) bch_btree_leaf_dirty(b, journal_ref); - } else { - struct closure cl; - - closure_init_stack(&cl); - bch_btree_node_write(b, &cl); - closure_sync(&cl); - } + else + bch_btree_node_write_sync(b); } } } while (!bch_keylist_empty(&split_keys)); -- cgit v0.10.2 From 8835c1234dd9a838993a2d5cb7572f57992ebbee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 23:18:05 -0700 Subject: bcache: Add make_btree_freeing_key() Refactoring, prep work for incremental garbage collection. 
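make_btree_freeing_key() leans on bcache's bucket generation numbers: a pointer records the generation it was created against, and once the bucket's generation moves past that the pointer reads as stale. The following toy model shows only that relationship, with invented types (struct bucket, struct ptr) rather than the real bkey machinery, and it glosses over the wraparound handling the real 8-bit generation comparison needs.

#include <stdbool.h>
#include <stdint.h>

struct bucket {
        uint8_t gen;            /* bumped each time the bucket is invalidated */
};

struct ptr {
        uint8_t gen;            /* generation the pointer was created against */
};

static struct ptr make_ptr(const struct bucket *b)
{
        return (struct ptr){ .gen = b->gen };
}

static bool ptr_stale(const struct bucket *b, const struct ptr *p)
{
        /* toy version: any generation mismatch means the pointer is dead */
        return b->gen != p->gen;
}

/* "Freeing" a bucket is just advancing its generation. */
static void invalidate_bucket(struct bucket *b)
{
        b->gen++;
}

int main(void)
{
        struct bucket b = { .gen = 3 };
        struct ptr p = make_ptr(&b);

        invalidate_bucket(&b);          /* old pointers to b are now stale */
        return ptr_stale(&b, &p) ? 0 : 1;
}

The helper itself builds a key whose pointer generations are one past the buckets' current ones (visible in the hunk below); the sketch is only meant to show why a generation bump is enough to retire existing pointers to those buckets.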
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7d6204c..a3f8ca4 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1107,6 +1107,22 @@ static struct btree *btree_node_alloc_replacement(struct btree *b) return n; } +static void make_btree_freeing_key(struct btree *b, struct bkey *k) +{ + unsigned i; + + bkey_copy(k, &b->key); + bkey_copy_key(k, &ZERO_KEY); + + for (i = 0; i < KEY_PTRS(k); i++) { + uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1; + + SET_PTR_GEN(k, i, g); + } + + atomic_inc(&b->c->prio_blocked); +} + /* Garbage collection */ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) @@ -2030,20 +2046,9 @@ static int btree_split(struct btree *b, struct btree_op *op, closure_sync(&cl); bch_btree_set_root(n1); } else { - unsigned i; - - bkey_copy(parent_keys->top, &b->key); - bkey_copy_key(parent_keys->top, &ZERO_KEY); - - for (i = 0; i < KEY_PTRS(&b->key); i++) { - uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; - - SET_PTR_GEN(parent_keys->top, i, g); - } - - bch_keylist_push(parent_keys); closure_sync(&cl); - atomic_inc(&b->c->prio_blocked); + make_btree_freeing_key(b, parent_keys->top); + bch_keylist_push(parent_keys); } rw_unlock(true, n1); -- cgit v0.10.2 From a1f0358b2bf69be216cb6e4ea40fe7ae4d38b8a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 19:07:00 -0700 Subject: bcache: Incremental gc Big garbage collection rewrite; now, garbage collection uses the same mechanisms as used elsewhere for inserting/updating btree node pointers, instead of rewriting interior btree nodes in place. This makes the code significantly cleaner and less fragile, and means we can now make garbage collection incremental - it doesn't have to hold a write lock on the root of the btree for the entire duration of garbage collection. This means that there's less of a latency hit for doing garbage collection, which means we can gc more frequently (and do a better job of reclaiming from the cache), and we can coalesce across more btree nodes (improving our space efficiency). Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d03bc6f..d6970a6 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -477,7 +477,6 @@ struct gc_stat { size_t nkeys; uint64_t data; /* sectors */ - uint64_t dirty; /* sectors */ unsigned in_use; /* percent */ }; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index a3f8ca4..7d283d2 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1176,12 +1176,10 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) -static int btree_gc_mark_node(struct btree *b, unsigned *keys, - struct gc_stat *gc) +static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; - unsigned last_dev = -1; - struct bcache_device *d = NULL; + unsigned keys = 0, good_keys = 0; struct bkey *k; struct btree_iter iter; struct bset_tree *t; @@ -1189,27 +1187,17 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, gc->nodes++; for_each_key_filter(b, k, &iter, bch_ptr_invalid) { - if (last_dev != KEY_INODE(k)) { - last_dev = KEY_INODE(k); - - d = KEY_INODE(k) < b->c->nr_uuids - ? 
b->c->devices[last_dev] - : NULL; - } - stale = max(stale, btree_mark_key(b, k)); + keys++; if (bch_ptr_bad(b, k)) continue; - *keys += bkey_u64s(k); - gc->key_bytes += bkey_u64s(k); gc->nkeys++; + good_keys++; gc->data += KEY_SIZE(k); - if (KEY_DIRTY(k)) - gc->dirty += KEY_SIZE(k); } for (t = b->sets; t <= &b->sets[b->nsets]; t++) @@ -1218,79 +1206,74 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, bkey_cmp(&b->key, &t->end) < 0, b, "found short btree key in gc"); - return stale; -} + if (b->c->gc_always_rewrite) + return true; -static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k) -{ - /* - * We block priorities from being written for the duration of garbage - * collection, so we can't sleep in btree_alloc() -> - * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it - * our closure. - */ - struct btree *n = btree_node_alloc_replacement(b); - - if (!IS_ERR_OR_NULL(n)) { - swap(b, n); + if (stale > 10) + return true; - memcpy(k->ptr, b->key.ptr, - sizeof(uint64_t) * KEY_PTRS(&b->key)); - - btree_node_free(n); - up_write(&n->lock); - } + if ((keys - good_keys) * 2 > keys) + return true; - return b; + return false; } -/* - * Leaving this at 2 until we've got incremental garbage collection done; it - * could be higher (and has been tested with 4) except that garbage collection - * could take much longer, adversely affecting latency. - */ -#define GC_MERGE_NODES 2U +#define GC_MERGE_NODES 4U struct gc_merge_info { struct btree *b; - struct bkey *k; unsigned keys; }; -static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, - struct gc_merge_info *r) +static int bch_btree_insert_node(struct btree *, struct btree_op *, + struct keylist *, atomic_t *, struct bkey *); + +static int btree_gc_coalesce(struct btree *b, struct btree_op *op, + struct keylist *keylist, struct gc_stat *gc, + struct gc_merge_info *r) { - unsigned nodes = 0, keys = 0, blocks; - int i; + unsigned i, nodes = 0, keys = 0, blocks; + struct btree *new_nodes[GC_MERGE_NODES]; struct closure cl; + struct bkey *k; + memset(new_nodes, 0, sizeof(new_nodes)); closure_init_stack(&cl); - while (nodes < GC_MERGE_NODES && r[nodes].b) + while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) keys += r[nodes++].keys; blocks = btree_default_blocks(b->c) * 2 / 3; if (nodes < 2 || __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) - return; - - for (i = nodes - 1; i >= 0; --i) { - if (r[i].b->written) - r[i].b = btree_gc_alloc(r[i].b, r[i].k); + return 0; - if (r[i].b->written) - return; + for (i = 0; i < nodes; i++) { + new_nodes[i] = btree_node_alloc_replacement(r[i].b); + if (IS_ERR_OR_NULL(new_nodes[i])) + goto out_nocoalesce; } for (i = nodes - 1; i > 0; --i) { - struct bset *n1 = r[i].b->sets->data; - struct bset *n2 = r[i - 1].b->sets->data; + struct bset *n1 = new_nodes[i]->sets->data; + struct bset *n2 = new_nodes[i - 1]->sets->data; struct bkey *k, *last = NULL; keys = 0; - if (i == 1) { + if (i > 1) { + for (k = n2->start; + k < end(n2); + k = bkey_next(k)) { + if (__set_blocks(n1, n1->keys + keys + + bkey_u64s(k), b->c) > blocks) + break; + + last = k; + keys += bkey_u64s(k); + } + } else { /* * Last node we're not getting rid of - we're getting * rid of the node at r[0]. 
Have to try and fit all of @@ -1299,37 +1282,27 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, * length keys (shouldn't be possible in practice, * though) */ - if (__set_blocks(n1, n1->keys + r->keys, - b->c) > btree_blocks(r[i].b)) - return; + if (__set_blocks(n1, n1->keys + n2->keys, + b->c) > btree_blocks(new_nodes[i])) + goto out_nocoalesce; keys = n2->keys; + /* Take the key of the node we're getting rid of */ last = &r->b->key; - } else - for (k = n2->start; - k < end(n2); - k = bkey_next(k)) { - if (__set_blocks(n1, n1->keys + keys + - bkey_u64s(k), b->c) > blocks) - break; - - last = k; - keys += bkey_u64s(k); - } + } BUG_ON(__set_blocks(n1, n1->keys + keys, - b->c) > btree_blocks(r[i].b)); + b->c) > btree_blocks(new_nodes[i])); - if (last) { - bkey_copy_key(&r[i].b->key, last); - bkey_copy_key(r[i].k, last); - } + if (last) + bkey_copy_key(&new_nodes[i]->key, last); memcpy(end(n1), n2->start, (void *) node(n2, keys) - (void *) n2->start); n1->keys += keys; + r[i].keys = n1->keys; memmove(n2->start, node(n2, keys), @@ -1337,93 +1310,175 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, n2->keys -= keys; - r[i].keys = n1->keys; - r[i - 1].keys = n2->keys; + if (bch_keylist_realloc(keylist, + KEY_PTRS(&new_nodes[i]->key), b->c)) + goto out_nocoalesce; + + bch_btree_node_write(new_nodes[i], &cl); + bch_keylist_add(keylist, &new_nodes[i]->key); } - btree_node_free(r->b); - up_write(&r->b->lock); + for (i = 0; i < nodes; i++) { + if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) + goto out_nocoalesce; - trace_bcache_btree_gc_coalesce(nodes); + make_btree_freeing_key(r[i].b, keylist->top); + bch_keylist_push(keylist); + } + + /* We emptied out this node */ + BUG_ON(new_nodes[0]->sets->data->keys); + btree_node_free(new_nodes[0]); + rw_unlock(true, new_nodes[0]); + closure_sync(&cl); + + for (i = 0; i < nodes; i++) { + btree_node_free(r[i].b); + rw_unlock(true, r[i].b); + + r[i].b = new_nodes[i]; + } + + bch_btree_insert_node(b, op, keylist, NULL, NULL); + BUG_ON(!bch_keylist_empty(keylist)); + + memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); + r[nodes - 1].b = ERR_PTR(-EINTR); + + trace_bcache_btree_gc_coalesce(nodes); gc->nodes--; - nodes--; - memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); - memset(&r[nodes], 0, sizeof(struct gc_merge_info)); + /* Invalidated our iterator */ + return -EINTR; + +out_nocoalesce: + closure_sync(&cl); + + while ((k = bch_keylist_pop(keylist))) + if (!bkey_cmp(k, &ZERO_KEY)) + atomic_dec(&b->c->prio_blocked); + + for (i = 0; i < nodes; i++) + if (!IS_ERR_OR_NULL(new_nodes[i])) { + btree_node_free(new_nodes[i]); + rw_unlock(true, new_nodes[i]); + } + return 0; } -static int btree_gc_recurse(struct btree *b, struct btree_op *op, - struct closure *writes, struct gc_stat *gc) +static unsigned btree_gc_count_keys(struct btree *b) { - void write(struct btree *r) - { - if (!r->written || btree_node_dirty(r)) - bch_btree_node_write(r, writes); + struct bkey *k; + struct btree_iter iter; + unsigned ret = 0; - up_write(&r->lock); - } + for_each_key_filter(b, k, &iter, bch_ptr_bad) + ret += bkey_u64s(k); - int ret = 0, stale; + return ret; +} + +static int btree_gc_recurse(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ unsigned i; + int ret = 0; + bool should_rewrite; + struct btree *n; + struct bkey *k; + struct keylist keys; + struct btree_iter iter; struct gc_merge_info r[GC_MERGE_NODES]; + struct gc_merge_info *last = r + GC_MERGE_NODES - 1; - memset(r, 0, 
sizeof(r)); + bch_keylist_init(&keys); + bch_btree_iter_init(b, &iter, &b->c->gc_done); - while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { - r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true); + for (i = 0; i < GC_MERGE_NODES; i++) + r[i].b = ERR_PTR(-EINTR); - if (IS_ERR(r->b)) { - ret = PTR_ERR(r->b); - break; + while (1) { + k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + if (k) { + r->b = bch_btree_node_get(b->c, k, b->level - 1, true); + if (IS_ERR(r->b)) { + ret = PTR_ERR(r->b); + break; + } + + r->keys = btree_gc_count_keys(r->b); + + ret = btree_gc_coalesce(b, op, &keys, gc, r); + if (ret) + break; } - r->keys = 0; - stale = btree_gc_mark_node(r->b, &r->keys, gc); + if (!last->b) + break; - if (!b->written && - (r->b->level || stale > 10 || - b->c->gc_always_rewrite)) - r->b = btree_gc_alloc(r->b, r->k); + if (!IS_ERR(last->b)) { + should_rewrite = btree_gc_mark_node(last->b, gc); + if (should_rewrite) { + n = btree_node_alloc_replacement(last->b); - if (r->b->level) - ret = btree_gc_recurse(r->b, op, writes, gc); + if (!IS_ERR_OR_NULL(n)) { + bch_btree_node_write_sync(n); + bch_keylist_add(&keys, &n->key); - if (ret) { - write(r->b); - break; - } + make_btree_freeing_key(last->b, + keys.top); + bch_keylist_push(&keys); + + btree_node_free(last->b); - bkey_copy_key(&b->c->gc_done, r->k); + bch_btree_insert_node(b, op, &keys, + NULL, NULL); + BUG_ON(!bch_keylist_empty(&keys)); - if (!b->written) - btree_gc_coalesce(b, gc, r); + rw_unlock(true, last->b); + last->b = n; + + /* Invalidated our iterator */ + ret = -EINTR; + break; + } + } - if (r[GC_MERGE_NODES - 1].b) - write(r[GC_MERGE_NODES - 1].b); + if (last->b->level) { + ret = btree_gc_recurse(last->b, op, writes, gc); + if (ret) + break; + } - memmove(&r[1], &r[0], - sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); + bkey_copy_key(&b->c->gc_done, &last->b->key); + + /* + * Must flush leaf nodes before gc ends, since replace + * operations aren't journalled + */ + if (btree_node_dirty(last->b)) + bch_btree_node_write(last->b, writes); + rw_unlock(true, last->b); + } + + memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); + r->b = NULL; - /* When we've got incremental GC working, we'll want to do - * if (should_resched()) - * return -EAGAIN; - */ - cond_resched(); -#if 0 if (need_resched()) { ret = -EAGAIN; break; } -#endif } - for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) - write(r[i].b); + for (i = 0; i < GC_MERGE_NODES; i++) + if (!IS_ERR_OR_NULL(r[i].b)) { + if (btree_node_dirty(r[i].b)) + bch_btree_node_write(r[i].b, writes); + rw_unlock(true, r[i].b); + } - /* Might have freed some children, must remove their keys */ - if (!b->written) - bch_btree_sort(b); + bch_keylist_free(&keys); return ret; } @@ -1432,27 +1487,31 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) { struct btree *n = NULL; - unsigned keys = 0; - int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); + int ret = 0; + bool should_rewrite; - if (b->level || stale > 10) + should_rewrite = btree_gc_mark_node(b, gc); + if (should_rewrite) { n = btree_node_alloc_replacement(b); - if (!IS_ERR_OR_NULL(n)) - swap(b, n); - - if (b->level) - ret = btree_gc_recurse(b, op, writes, gc); + if (!IS_ERR_OR_NULL(n)) { + bch_btree_node_write_sync(n); + bch_btree_set_root(n); + btree_node_free(b); + rw_unlock(true, n); - if (!b->written || btree_node_dirty(b)) - bch_btree_node_write_sync(b); + return -EINTR; + } + } - if (!IS_ERR_OR_NULL(n)) { - bch_btree_set_root(b); - 
btree_node_free(n); - rw_unlock(true, b); + if (b->level) { + ret = btree_gc_recurse(b, op, writes, gc); + if (ret) + return ret; } + bkey_copy_key(&b->c->gc_done, &b->key); + return ret; } @@ -1550,29 +1609,20 @@ static void bch_btree_gc(struct cache_set *c) btree_gc_start(c); - atomic_inc(&c->prio_blocked); - - ret = btree_root(gc_root, c, &op, &writes, &stats); - closure_sync(&writes); - - if (ret) { - pr_warn("gc failed!"); - return; - } + do { + ret = btree_root(gc_root, c, &op, &writes, &stats); + closure_sync(&writes); - /* Possibly wait for new UUIDs or whatever to hit disk */ - bch_journal_meta(c, &writes); - closure_sync(&writes); + if (ret && ret != -EAGAIN) + pr_warn("gc failed!"); + } while (ret); available = bch_btree_gc_finish(c); - - atomic_dec(&c->prio_blocked); wake_up_allocators(c); bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); - stats.dirty <<= 9; stats.data <<= 9; stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); @@ -1585,14 +1635,28 @@ static void bch_btree_gc(struct cache_set *c) static int bch_gc_thread(void *arg) { struct cache_set *c = arg; + struct cache *ca; + unsigned i; while (1) { +again: bch_btree_gc(c); set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) break; + mutex_lock(&c->bucket_lock); + + for_each_cache(ca, c, i) + if (ca->invalidate_needs_gc) { + mutex_unlock(&c->bucket_lock); + set_current_state(TASK_RUNNING); + goto again; + } + + mutex_unlock(&c->bucket_lock); + try_to_freeze(); schedule(); } @@ -2083,8 +2147,6 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, bch_keylist_init(&split_keys); - BUG_ON(b->level); - do { BUG_ON(b->level && replace_key); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index e11bb85..b5a46af 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -201,7 +201,7 @@ static inline bool bkey_written(struct btree *b, struct bkey *k) static inline void set_gc_sectors(struct cache_set *c) { - atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); + atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); } static inline struct bkey *bch_btree_iter_init(struct btree *b, diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 9687771..c5f73e3 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -489,7 +489,6 @@ lock_root: sysfs_print(btree_used_percent, btree_used(c)); sysfs_print(btree_nodes, c->gc_stats.nodes); - sysfs_hprint(dirty_data, c->gc_stats.dirty); sysfs_hprint(average_key_size, average_key_size(c)); sysfs_print(cache_read_races, @@ -642,7 +641,6 @@ static struct attribute *bch_cache_set_files[] = { &sysfs_cache_available_percent, &sysfs_average_key_size, - &sysfs_dirty_data, &sysfs_errors, &sysfs_io_error_limit, -- cgit v0.10.2 From bc9389eefe479b7b7b323c2729b61a7155d2d0ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 19:07:35 -0700 Subject: bcache: Avoid deadlocking in garbage collection Not a complete fix - we could still deadlock if btree_insert_node() has to split... 
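The deadlock being avoided here is the classic one where the allocator's consumer is also its producer: garbage collection is what eventually refills the free buckets, so GC itself must never sleep waiting for one. A schematic, self-contained C sketch of the wait/nowait split follows; bucket_alloc() and the little array-based free list are invented for illustration and merely stand in for the bool argument this patch threads through bch_btree_node_alloc().

#include <stdbool.h>
#include <stddef.h>

struct bucket { int id; };

static struct bucket pool[4];
static int nfree;                       /* toy free list: pool[0..nfree-1] are free */

static struct bucket *try_take_free_bucket(void)
{
        return nfree ? &pool[--nfree] : NULL;
}

/*
 * wait == true : ordinary callers, allowed to block until a bucket frees up.
 * wait == false: callers running inside GC; they get NULL on failure and
 *                back out (the coalesce path simply gives up) rather than
 *                sleeping on work that only GC itself can perform.
 */
static struct bucket *bucket_alloc(bool wait)
{
        struct bucket *b = try_take_free_bucket();

        if (b || !wait)
                return b;

        /* A real allocator would sleep here until a bucket is freed; the
         * toy has nothing to wait on, so it reports failure as well. */
        return NULL;
}

int main(void)
{
        nfree = 0;                      /* simulate an exhausted free list */
        return bucket_alloc(false) == NULL ? 0 : 1;
}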
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7d283d2..bf8f517 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1059,14 +1059,14 @@ static void btree_node_free(struct btree *b) mutex_unlock(&b->c->bucket_lock); } -struct btree *bch_btree_node_alloc(struct cache_set *c, int level) +struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) { BKEY_PADDED(key) k; struct btree *b = ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) + if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) goto err; bkey_put(c, &k.key); @@ -1098,9 +1098,9 @@ err: return b; } -static struct btree *btree_node_alloc_replacement(struct btree *b) +static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) { - struct btree *n = bch_btree_node_alloc(b->c, b->level); + struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); if (!IS_ERR_OR_NULL(n)) bch_btree_sort_into(b, n); @@ -1250,7 +1250,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, return 0; for (i = 0; i < nodes; i++) { - new_nodes[i] = btree_node_alloc_replacement(r[i].b); + new_nodes[i] = btree_node_alloc_replacement(r[i].b, false); if (IS_ERR_OR_NULL(new_nodes[i])) goto out_nocoalesce; } @@ -1420,7 +1420,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, if (!IS_ERR(last->b)) { should_rewrite = btree_gc_mark_node(last->b, gc); if (should_rewrite) { - n = btree_node_alloc_replacement(last->b); + n = btree_node_alloc_replacement(last->b, + false); if (!IS_ERR_OR_NULL(n)) { bch_btree_node_write_sync(n); @@ -1492,7 +1493,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, should_rewrite = btree_gc_mark_node(b, gc); if (should_rewrite) { - n = btree_node_alloc_replacement(b); + n = btree_node_alloc_replacement(b, false); if (!IS_ERR_OR_NULL(n)) { bch_btree_node_write_sync(n); @@ -2038,7 +2039,7 @@ static int btree_split(struct btree *b, struct btree_op *op, closure_init_stack(&cl); - n1 = btree_node_alloc_replacement(b); + n1 = btree_node_alloc_replacement(b, true); if (IS_ERR(n1)) goto err; @@ -2049,12 +2050,12 @@ static int btree_split(struct btree *b, struct btree_op *op, trace_bcache_btree_node_split(b, n1->sets[0].data->keys); - n2 = bch_btree_node_alloc(b->c, b->level); + n2 = bch_btree_node_alloc(b->c, b->level, true); if (IS_ERR(n2)) goto err_free1; if (!b->parent) { - n3 = bch_btree_node_alloc(b->c, b->level + 1); + n3 = bch_btree_node_alloc(b->c, b->level + 1, true); if (IS_ERR(n3)) goto err_free2; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index b5a46af..767e755 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -271,7 +271,7 @@ void bch_btree_node_read(struct btree *); void bch_btree_node_write(struct btree *, struct closure *); void bch_btree_set_root(struct btree *); -struct btree *bch_btree_node_alloc(struct cache_set *, int); +struct btree *bch_btree_node_alloc(struct cache_set *, int, bool); struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7ab926d..e21200e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1601,7 +1601,7 @@ static void run_cache_set(struct cache_set *c) goto err; err = "cannot allocate new btree root"; - c->root = 
bch_btree_node_alloc(c, 0); + c->root = bch_btree_node_alloc(c, 0, true); if (IS_ERR_OR_NULL(c->root)) goto err; -- cgit v0.10.2 From 50310164bcd789eb3690f45a9baf8a507bf93358 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 17:18:59 -0700 Subject: bcache: Kill bch_next_recurse_key() This dates from before the btree iterator, and now it's finally gone Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index b615348..b0fe0dc 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -952,14 +952,6 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, return ret; } -struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) -{ - struct btree_iter iter; - - bch_btree_iter_init(b, &iter, search); - return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); -} - /* Mergesort */ static void sort_key_next(struct btree_iter *iter, diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index bc99755..1d3c24f 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -303,7 +303,6 @@ static inline bool ptr_available(struct cache_set *c, const struct bkey *k, typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); -struct bkey *bch_next_recurse_key(struct btree *, struct bkey *); struct bkey *bch_btree_iter_next(struct btree_iter *); struct bkey *bch_btree_iter_next_filter(struct btree_iter *, struct btree *, ptr_filter_fn); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index bf8f517..adc5bb0 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1680,9 +1680,9 @@ int bch_gc_thread_start(struct cache_set *c) static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, unsigned long **seen) { - int ret; + int ret = 0; unsigned i; - struct bkey *k; + struct bkey *k, *p = NULL; struct bucket *g; struct btree_iter iter; @@ -1709,19 +1709,18 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, } if (b->level) { - k = bch_next_recurse_key(b, &ZERO_KEY); + bch_btree_iter_init(b, &iter, NULL); - while (k) { - struct bkey *p = bch_next_recurse_key(b, k); - if (p) - btree_node_prefetch(b->c, p, b->level - 1); + do { + k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + if (k) + btree_node_prefetch(b->c, k, b->level - 1); - ret = btree(check_recurse, k, b, op, seen); - if (ret) - return ret; + if (p) + ret = btree(check_recurse, p, b, op, seen); - k = p; - } + p = k; + } while (p && !ret); } return 0; -- cgit v0.10.2 From 8aee122071a69ca6fa3314da7713bdf0b61dc07c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2013 22:34:40 -0700 Subject: bcache: Kill sequential_merge option It never really made sense to expose this, so just kill it. 
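With the option gone, the sequential-I/O detector in check_should_bypass() always runs. Condensed from the request.c hunk that follows (the overflow guard and per-task bookkeeping are trimmed here), the idea is a small hash of recently seen I/O end positions: a bio that starts where a recent one ended, within five seconds, is treated as continuing that stream, and the stream's running size feeds the bypass decision:

    spin_lock(&dc->io_lock);

    /* Does this bio pick up where a recent one left off? */
    hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
            if (i->last == bio->bi_sector &&
                time_before(jiffies, i->jiffies))
                    goto found;

    /* No: recycle the least recently used slot for a new stream. */
    i = list_first_entry(&dc->io_lru, struct io, lru);
    i->sequential = 0;
found:
    i->sequential += bio->bi_size;            /* stream length so far */
    i->last = bio_end_sector(bio);            /* expected start of the next bio */
    i->jiffies = jiffies + msecs_to_jiffies(5000);

    hlist_del(&i->hash);
    hlist_add_head(&i->hash, iohash(dc, i->last));   /* rehash on the new end */
    list_move_tail(&i->lru, &dc->io_lru);

    spin_unlock(&dc->io_lock);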
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d6970a6..3227355 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -364,7 +364,6 @@ struct cached_dev { unsigned sequential_cutoff; unsigned readahead; - unsigned sequential_merge:1; unsigned verify:1; unsigned partial_stripes_expensive:1; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 932300f..f645da6 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -510,6 +510,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) unsigned mode = cache_mode(dc, bio); unsigned sectors, congested = bch_get_congested(c); struct task_struct *task = current; + struct io *i; if (atomic_read(&dc->disk.detaching) || c->gc_stats.in_use > CUTOFF_CACHE_ADD || @@ -536,38 +537,30 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) (bio->bi_rw & REQ_SYNC)) goto rescale; - if (dc->sequential_merge) { - struct io *i; + spin_lock(&dc->io_lock); - spin_lock(&dc->io_lock); + hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) + if (i->last == bio->bi_sector && + time_before(jiffies, i->jiffies)) + goto found; - hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) - if (i->last == bio->bi_sector && - time_before(jiffies, i->jiffies)) - goto found; + i = list_first_entry(&dc->io_lru, struct io, lru); - i = list_first_entry(&dc->io_lru, struct io, lru); - - add_sequential(task); - i->sequential = 0; + add_sequential(task); + i->sequential = 0; found: - if (i->sequential + bio->bi_size > i->sequential) - i->sequential += bio->bi_size; - - i->last = bio_end_sector(bio); - i->jiffies = jiffies + msecs_to_jiffies(5000); - task->sequential_io = i->sequential; + if (i->sequential + bio->bi_size > i->sequential) + i->sequential += bio->bi_size; - hlist_del(&i->hash); - hlist_add_head(&i->hash, iohash(dc, i->last)); - list_move_tail(&i->lru, &dc->io_lru); + i->last = bio_end_sector(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + task->sequential_io = i->sequential; - spin_unlock(&dc->io_lock); - } else { - task->sequential_io = bio->bi_size; + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); - add_sequential(task); - } + spin_unlock(&dc->io_lock); sectors = max(task->sequential_io, task->sequential_io_avg) >> 9; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e21200e..041dd9d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1079,7 +1079,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) spin_lock_init(&dc->io_lock); bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); - dc->sequential_merge = true; dc->sequential_cutoff = 4 << 20; for (io = dc->io; io < dc->io + RECENT_IO; io++) { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index c5f73e3..4b67244 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -72,7 +72,6 @@ rw_attribute(congested_read_threshold_us); rw_attribute(congested_write_threshold_us); rw_attribute(sequential_cutoff); -rw_attribute(sequential_merge); rw_attribute(data_csum); rw_attribute(cache_mode); rw_attribute(writeback_metadata); @@ -161,7 +160,6 @@ SHOW(__bch_cached_dev) sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); var_printf(partial_stripes_expensive, "%u"); - var_printf(sequential_merge, "%i"); var_hprint(sequential_cutoff); var_hprint(readahead); @@ -207,7 +205,6 @@ 
STORE(__cached_dev) dc->writeback_rate_p_term_inverse, 1, INT_MAX); d_strtoul(writeback_rate_d_smooth); - d_strtoul(sequential_merge); d_strtoi_h(sequential_cutoff); d_strtoi_h(readahead); @@ -319,7 +316,6 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_stripe_size, &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, - &sysfs_sequential_merge, &sysfs_clear_stats, &sysfs_running, &sysfs_state, -- cgit v0.10.2 From 65d22e911bfc4f46cda4751f1b1926b43c316c14 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jul 2013 00:03:54 -0700 Subject: bcache: Move spinlock into struct time_stats Minor cleanup. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 3227355..816d079 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -666,11 +666,9 @@ struct cache_set { unsigned congested_read_threshold_us; unsigned congested_write_threshold_us; - spinlock_t sort_time_lock; struct time_stats sort_time; struct time_stats btree_gc_time; struct time_stats btree_split_time; - spinlock_t btree_read_time_lock; struct time_stats btree_read_time; struct time_stats try_harder_time; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index b0fe0dc..1457339 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1077,11 +1077,8 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, if (b->written) bset_build_written_tree(b); - if (!start) { - spin_lock(&b->c->sort_time_lock); + if (!start) bch_time_stats_update(&b->c->sort_time, start_time); - spin_unlock(&b->c->sort_time_lock); - } } void bch_btree_sort_partial(struct btree *b, unsigned start) @@ -1128,9 +1125,7 @@ void bch_btree_sort_into(struct btree *b, struct btree *new) btree_mergesort(b, new->sets->data, &iter, false, true); - spin_lock(&b->c->sort_time_lock); bch_time_stats_update(&b->c->sort_time, start_time); - spin_unlock(&b->c->sort_time_lock); bkey_copy_key(&new->key, &b->key); new->sets->size = 0; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index adc5bb0..1a7530c 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -317,10 +317,7 @@ void bch_btree_node_read(struct btree *b) goto err; bch_btree_node_read_done(b); - - spin_lock(&b->c->btree_read_time_lock); bch_time_stats_update(&b->c->btree_read_time, start_time); - spin_unlock(&b->c->btree_read_time_lock); return; err: diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 041dd9d..4813ef6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1435,9 +1435,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) init_waitqueue_head(&c->try_wait); init_waitqueue_head(&c->bucket_wait); closure_init_unlocked(&c->uuid_write); - spin_lock_init(&c->sort_time_lock); mutex_init(&c->sort_lock); - spin_lock_init(&c->btree_read_time_lock); + + spin_lock_init(&c->sort_time.lock); + spin_lock_init(&c->btree_gc_time.lock); + spin_lock_init(&c->btree_split_time.lock); + spin_lock_init(&c->btree_read_time.lock); + spin_lock_init(&c->try_harder_time.lock); bch_moving_init_cache_set(c); diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 420dad5..462214e 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -168,10 +168,14 @@ int bch_parse_uuid(const char *s, char *uuid) void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) { - uint64_t now = local_clock(); - uint64_t duration = time_after64(now, start_time) + uint64_t now, 
duration, last; + + spin_lock(&stats->lock); + + now = local_clock(); + duration = time_after64(now, start_time) ? now - start_time : 0; - uint64_t last = time_after64(now, stats->last) + last = time_after64(now, stats->last) ? now - stats->last : 0; stats->max_duration = max(stats->max_duration, duration); @@ -188,6 +192,8 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) } stats->last = now ?: 1; + + spin_unlock(&stats->lock); } /** diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 8ce5aab..362c4b3 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -378,6 +378,7 @@ ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[ ssize_t bch_read_string_list(const char *buf, const char * const list[]); struct time_stats { + spinlock_t lock; /* * all fields are in nanoseconds, averages are ewmas stored left shifted * by 8 -- cgit v0.10.2 From 17e21a9f248d3d330acdfb2405c23b8d84c9c23a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Jul 2013 12:32:38 -0700 Subject: bcache: Have btree_split() insert into parent directly The flow control in btree_insert_node() was... fragile... before, this'll use more stack (but since our btrees are never more than depth 1, that shouldn't matter) and it should be significantly clearer and less fragile. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 1a7530c..6def7c9 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2025,15 +2025,16 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, static int btree_split(struct btree *b, struct btree_op *op, struct keylist *insert_keys, - struct keylist *parent_keys, struct bkey *replace_key) { bool split; struct btree *n1, *n2 = NULL, *n3 = NULL; uint64_t start_time = local_clock(); struct closure cl; + struct keylist parent_keys; closure_init_stack(&cl); + bch_keylist_init(&parent_keys); n1 = btree_node_alloc_replacement(b, true); if (IS_ERR(n1)) @@ -2078,7 +2079,7 @@ static int btree_split(struct btree *b, struct btree_op *op, bkey_copy_key(&n2->key, &b->key); - bch_keylist_add(parent_keys, &n2->key); + bch_keylist_add(&parent_keys, &n2->key); bch_btree_node_write(n2, &cl); rw_unlock(true, n2); } else { @@ -2087,33 +2088,39 @@ static int btree_split(struct btree *b, struct btree_op *op, bch_btree_insert_keys(n1, op, insert_keys, replace_key); } - bch_keylist_add(parent_keys, &n1->key); + bch_keylist_add(&parent_keys, &n1->key); bch_btree_node_write(n1, &cl); if (n3) { /* Depth increases, make a new root */ - bkey_copy_key(&n3->key, &MAX_KEY); - bch_btree_insert_keys(n3, op, parent_keys, NULL); + bch_btree_insert_keys(n3, op, &parent_keys, NULL); bch_btree_node_write(n3, &cl); closure_sync(&cl); bch_btree_set_root(n3); rw_unlock(true, n3); + + btree_node_free(b); } else if (!b->parent) { /* Root filled up but didn't need to be split */ - - bch_keylist_reset(parent_keys); closure_sync(&cl); bch_btree_set_root(n1); + + btree_node_free(b); } else { + /* Split a non root node */ closure_sync(&cl); - make_btree_freeing_key(b, parent_keys->top); - bch_keylist_push(parent_keys); + make_btree_freeing_key(b, parent_keys.top); + bch_keylist_push(&parent_keys); + + btree_node_free(b); + + bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&parent_keys)); } rw_unlock(true, n1); - btree_node_free(b); bch_time_stats_update(&b->c->btree_split_time, start_time); @@ -2139,46 +2146,32 @@ static int 
bch_btree_insert_node(struct btree *b, struct btree_op *op, atomic_t *journal_ref, struct bkey *replace_key) { - int ret = 0; - struct keylist split_keys; - - bch_keylist_init(&split_keys); + BUG_ON(b->level && replace_key); - do { - BUG_ON(b->level && replace_key); - - if (should_split(b)) { - if (current->bio_list) { - op->lock = b->c->root->level + 1; - ret = -EAGAIN; - } else if (op->lock <= b->c->root->level) { - op->lock = b->c->root->level + 1; - ret = -EINTR; - } else { - struct btree *parent = b->parent; - - ret = btree_split(b, op, insert_keys, - &split_keys, replace_key); - insert_keys = &split_keys; - replace_key = NULL; - b = parent; - if (!ret) - ret = -EINTR; - } + if (should_split(b)) { + if (current->bio_list) { + op->lock = b->c->root->level + 1; + return -EAGAIN; + } else if (op->lock <= b->c->root->level) { + op->lock = b->c->root->level + 1; + return -EINTR; } else { - BUG_ON(write_block(b) != b->sets[b->nsets].data); - - if (bch_btree_insert_keys(b, op, insert_keys, - replace_key)) { - if (!b->level) - bch_btree_leaf_dirty(b, journal_ref); - else - bch_btree_node_write_sync(b); - } + /* Invalidated all iterators */ + return btree_split(b, op, insert_keys, replace_key) ?: + -EINTR; } - } while (!bch_keylist_empty(&split_keys)); + } else { + BUG_ON(write_block(b) != b->sets[b->nsets].data); - return ret; + if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { + if (!b->level) + bch_btree_leaf_dirty(b, journal_ref); + else + bch_btree_node_write_sync(b); + } + + return 0; + } } int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, -- cgit v0.10.2 From 48a915a87f0bd98c3d68d029acf223a2e5116f07 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2013 15:43:22 -0700 Subject: bcache: Better full stripe scanning The old scanning-by-stripe code burned too much CPU, this should be better. 
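The replacement keeps a bitmap, full_dirty_stripes, with one bit per stripe next to the existing per-stripe dirty-sector counters. The bit is maintained in the same place the counters are updated, so the writeback refill path can jump straight to completely dirty stripes with find_next_bit() instead of re-reading every counter on each pass. Condensed from the writeback.c hunks that follow:

    /* Accounting side: set the stripe's bit when it becomes completely
     * dirty, clear it the moment it is not. */
    sectors_dirty = atomic_add_return(s, d->stripe_sectors_dirty + stripe);
    if (sectors_dirty == d->stripe_size)
            set_bit(stripe, d->full_dirty_stripes);
    else
            clear_bit(stripe, d->full_dirty_stripes);

    /* Refill side: walk only the set bits, queueing each full stripe's
     * dirty keys until the keybuf freelist runs out. */
    stripe = find_next_bit(dc->disk.full_dirty_stripes,
                           dc->disk.nr_stripes, stripe);
    next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
                                     dc->disk.nr_stripes, stripe);

    bch_refill_keybuf(dc->disk.c, buf,
                      &KEY(dc->disk.id, next_stripe * dc->disk.stripe_size, 0),
                      dirty_pred);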
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 816d079..ab0b215 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -237,7 +237,7 @@ struct keybuf { struct rb_root keys; -#define KEYBUF_NR 100 +#define KEYBUF_NR 500 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); }; @@ -273,9 +273,10 @@ struct bcache_device { atomic_t detaching; int flush_done; - uint64_t nr_stripes; + unsigned nr_stripes; unsigned stripe_size; atomic_t *stripe_sectors_dirty; + unsigned long *full_dirty_stripes; unsigned long sectors_dirty_last; long sectors_dirty_derivative; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 6def7c9..5e2765a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2378,6 +2378,7 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, struct refill { struct btree_op op; + unsigned nr_found; struct keybuf *buf; struct bkey *end; keybuf_pred_fn *pred; @@ -2414,6 +2415,8 @@ static int refill_keybuf_fn(struct btree_op *op, struct btree *b, if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) array_free(&buf->freelist, w); + else + refill->nr_found++; if (array_freelist_empty(&buf->freelist)) ret = MAP_DONE; @@ -2434,18 +2437,18 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, cond_resched(); bch_btree_op_init(&refill.op, -1); - refill.buf = buf; - refill.end = end; - refill.pred = pred; + refill.nr_found = 0; + refill.buf = buf; + refill.end = end; + refill.pred = pred; bch_btree_map_keys(&refill.op, c, &buf->last_scanned, refill_keybuf_fn, MAP_END_KEY); - pr_debug("found %s keys from %llu:%llu to %llu:%llu", - RB_EMPTY_ROOT(&buf->keys) ? "no" : - array_freelist_empty(&buf->freelist) ? "some" : "a few", - KEY_INODE(&start), KEY_OFFSET(&start), - KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); + trace_bcache_keyscan(refill.nr_found, + KEY_INODE(&start), KEY_OFFSET(&start), + KEY_INODE(&buf->last_scanned), + KEY_OFFSET(&buf->last_scanned)); spin_lock(&buf->lock); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 4813ef6..43fcfe3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -738,6 +738,10 @@ static void bcache_device_free(struct bcache_device *d) mempool_destroy(d->unaligned_bvec); if (d->bio_split) bioset_free(d->bio_split); + if (is_vmalloc_addr(d->full_dirty_stripes)) + vfree(d->full_dirty_stripes); + else + kfree(d->full_dirty_stripes); if (is_vmalloc_addr(d->stripe_sectors_dirty)) vfree(d->stripe_sectors_dirty); else @@ -757,8 +761,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); - if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) + if (!d->nr_stripes || + d->nr_stripes > INT_MAX || + d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) { + pr_err("nr_stripes too large"); return -ENOMEM; + } n = d->nr_stripes * sizeof(atomic_t); d->stripe_sectors_dirty = n < PAGE_SIZE << 6 @@ -767,6 +775,13 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->stripe_sectors_dirty) return -ENOMEM; + n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); + d->full_dirty_stripes = n < PAGE_SIZE << 6 + ? 
kzalloc(n, GFP_KERNEL) + : vzalloc(n); + if (!d->full_dirty_stripes) + return -ENOMEM; + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, sizeof(struct bio_vec) * BIO_MAX_PAGES)) || diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index ab0f6b4..22e21dc 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -292,14 +292,12 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, uint64_t offset, int nr_sectors) { struct bcache_device *d = c->devices[inode]; - unsigned stripe_offset; - uint64_t stripe = offset; + unsigned stripe_offset, stripe, sectors_dirty; if (!d) return; - do_div(stripe, d->stripe_size); - + stripe = offset_to_stripe(d, offset); stripe_offset = offset & (d->stripe_size - 1); while (nr_sectors) { @@ -309,7 +307,16 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, if (nr_sectors < 0) s = -s; - atomic_add(s, d->stripe_sectors_dirty + stripe); + if (stripe >= d->nr_stripes) + return; + + sectors_dirty = atomic_add_return(s, + d->stripe_sectors_dirty + stripe); + if (sectors_dirty == d->stripe_size) + set_bit(stripe, d->full_dirty_stripes); + else + clear_bit(stripe, d->full_dirty_stripes); + nr_sectors -= s; stripe_offset = 0; stripe++; @@ -321,59 +328,70 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) return KEY_DIRTY(k); } -static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) +static void refill_full_stripes(struct cached_dev *dc) { - uint64_t stripe = KEY_START(k); - unsigned nr_sectors = KEY_SIZE(k); - struct cached_dev *dc = container_of(buf, struct cached_dev, - writeback_keys); + struct keybuf *buf = &dc->writeback_keys; + unsigned start_stripe, stripe, next_stripe; + bool wrapped = false; + + stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); - if (!KEY_DIRTY(k)) - return false; + if (stripe >= dc->disk.nr_stripes) + stripe = 0; - do_div(stripe, dc->disk.stripe_size); + start_stripe = stripe; while (1) { - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) == - dc->disk.stripe_size) - return true; + stripe = find_next_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); - if (nr_sectors <= dc->disk.stripe_size) - return false; + if (stripe == dc->disk.nr_stripes) + goto next; - nr_sectors -= dc->disk.stripe_size; - stripe++; + next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + buf->last_scanned = KEY(dc->disk.id, + stripe * dc->disk.stripe_size, 0); + + bch_refill_keybuf(dc->disk.c, buf, + &KEY(dc->disk.id, + next_stripe * dc->disk.stripe_size, 0), + dirty_pred); + + if (array_freelist_empty(&buf->freelist)) + return; + + stripe = next_stripe; +next: + if (wrapped && stripe > start_stripe) + return; + + if (stripe == dc->disk.nr_stripes) { + stripe = 0; + wrapped = true; + } } } static bool refill_dirty(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; - bool searched_from_start = false; struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); + bool searched_from_start = false; + + if (dc->partial_stripes_expensive) { + refill_full_stripes(dc); + if (array_freelist_empty(&buf->freelist)) + return false; + } if (bkey_cmp(&buf->last_scanned, &end) >= 0) { buf->last_scanned = KEY(dc->disk.id, 0, 0); searched_from_start = true; } - if (dc->partial_stripes_expensive) { - uint64_t i; - - for (i = 0; i < dc->disk.nr_stripes; i++) - if (atomic_read(dc->disk.stripe_sectors_dirty + 
i) == - dc->disk.stripe_size) - goto full_stripes; - - goto normal_refill; -full_stripes: - searched_from_start = false; /* not searching entire btree */ - bch_refill_keybuf(dc->disk.c, buf, &end, - dirty_full_stripe_pred); - } else { -normal_refill: - bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - } + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 60516bf..fe7d9d5 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -14,22 +14,27 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, +static inline unsigned offset_to_stripe(struct bcache_device *d, + uint64_t offset) +{ + do_div(offset, d->stripe_size); + return offset; +} + +static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, uint64_t offset, unsigned nr_sectors) { - uint64_t stripe = offset; - - do_div(stripe, d->stripe_size); + unsigned stripe = offset_to_stripe(&dc->disk, offset); while (1) { - if (atomic_read(d->stripe_sectors_dirty + stripe)) + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) return true; - if (nr_sectors <= d->stripe_size) + if (nr_sectors <= dc->disk.stripe_size) return false; - nr_sectors -= d->stripe_size; + nr_sectors -= dc->disk.stripe_size; stripe++; } } @@ -45,7 +50,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, return false; if (dc->partial_stripes_expensive && - bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, + bcache_dev_stripe_dirty(dc, bio->bi_sector, bio_sectors(bio))) return true; diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 32c89b3..e2b9576 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -368,6 +368,35 @@ DEFINE_EVENT(btree_node, bcache_btree_set_root, TP_ARGS(b) ); +TRACE_EVENT(bcache_keyscan, + TP_PROTO(unsigned nr_found, + unsigned start_inode, uint64_t start_offset, + unsigned end_inode, uint64_t end_offset), + TP_ARGS(nr_found, + start_inode, start_offset, + end_inode, end_offset), + + TP_STRUCT__entry( + __field(__u32, nr_found ) + __field(__u32, start_inode ) + __field(__u64, start_offset ) + __field(__u32, end_inode ) + __field(__u64, end_offset ) + ), + + TP_fast_assign( + __entry->nr_found = nr_found; + __entry->start_inode = start_inode; + __entry->start_offset = start_offset; + __entry->end_inode = end_inode; + __entry->end_offset = end_offset; + ), + + TP_printk("found %u keys from %u:%llu to %u:%llu", __entry->nr_found, + __entry->start_inode, __entry->start_offset, + __entry->end_inode, __entry->end_offset) +); + /* Allocator */ TRACE_EVENT(bcache_alloc_invalidate, -- cgit v0.10.2 From c4d951ddb66fe1d087447b0ba65c4fa4446f1083 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2013 17:49:09 -0700 Subject: bcache: Fix sysfs splat on shutdown with flash only devs Whoops. 
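The underlying problem: the shutdown path could try to remove the device's sysfs links more than once, or for a device that was never linked to a cache set. The fix folds the old closing/detaching atomics and flush_done field into a single flags word and guards the unlink with a new BCACHE_DEV_UNLINK_DONE bit, making it idempotent; condensed from the super.c hunks that follow:

    unsigned long flags;
    #define BCACHE_DEV_CLOSING     0
    #define BCACHE_DEV_DETACHING   1
    #define BCACHE_DEV_UNLINK_DONE 2

    static void bcache_device_unlink(struct bcache_device *d)
    {
            lockdep_assert_held(&bch_register_lock);

            /* Only unlink once, and only if we were ever linked. */
            if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
                    sysfs_remove_link(&d->c->kobj, d->name);
                    sysfs_remove_link(&d->kobj, "cache");
                    /* plus bd_unlink_disk_holder() for each cache */
            }
    }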
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index ab0b215..97ef126 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -266,12 +266,10 @@ struct bcache_device { struct gendisk *disk; - /* If nonzero, we're closing */ - atomic_t closing; - - /* If nonzero, we're detaching/unregistering from cache set */ - atomic_t detaching; - int flush_done; + unsigned long flags; +#define BCACHE_DEV_CLOSING 0 +#define BCACHE_DEV_DETACHING 1 +#define BCACHE_DEV_UNLINK_DONE 2 unsigned nr_stripes; unsigned stripe_size; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f645da6..9f5a138 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -512,7 +512,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) struct task_struct *task = current; struct io *i; - if (atomic_read(&dc->disk.detaching) || + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || c->gc_stats.in_use > CUTOFF_CACHE_ADD || (bio->bi_rw & REQ_DISCARD)) goto skip; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 43fcfe3..fa1d530 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -621,7 +621,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) static int open_dev(struct block_device *b, fmode_t mode) { struct bcache_device *d = b->bd_disk->private_data; - if (atomic_read(&d->closing)) + if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) return -ENXIO; closure_get(&d->cl); @@ -650,20 +650,24 @@ static const struct block_device_operations bcache_ops = { void bcache_device_stop(struct bcache_device *d) { - if (!atomic_xchg(&d->closing, 1)) + if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) closure_queue(&d->cl); } static void bcache_device_unlink(struct bcache_device *d) { - unsigned i; - struct cache *ca; + lockdep_assert_held(&bch_register_lock); - sysfs_remove_link(&d->c->kobj, d->name); - sysfs_remove_link(&d->kobj, "cache"); + if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { + unsigned i; + struct cache *ca; - for_each_cache(ca, d->c, i) - bd_unlink_disk_holder(ca->bdev, d->disk); + sysfs_remove_link(&d->c->kobj, d->name); + sysfs_remove_link(&d->kobj, "cache"); + + for_each_cache(ca, d->c, i) + bd_unlink_disk_holder(ca->bdev, d->disk); + } } static void bcache_device_link(struct bcache_device *d, struct cache_set *c, @@ -687,19 +691,16 @@ static void bcache_device_detach(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); - if (atomic_read(&d->detaching)) { + if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { struct uuid_entry *u = d->c->uuids + d->id; SET_UUID_FLASH_ONLY(u, 0); memcpy(u->uuid, invalid_uuid, 16); u->invalidated = cpu_to_le32(get_seconds()); bch_uuid_write(d->c); - - atomic_set(&d->detaching, 0); } - if (!d->flush_done) - bcache_device_unlink(d); + bcache_device_unlink(d); d->c->devices[d->id] = NULL; closure_put(&d->c->caching); @@ -879,7 +880,7 @@ static void cached_dev_detach_finish(struct work_struct *w) struct closure cl; closure_init_stack(&cl); - BUG_ON(!atomic_read(&dc->disk.detaching)); + BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(atomic_read(&dc->count)); mutex_lock(&bch_register_lock); @@ -893,6 +894,8 @@ static void cached_dev_detach_finish(struct work_struct *w) bcache_device_detach(&dc->disk); list_move(&dc->list, &uncached_devices); + clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); + mutex_unlock(&bch_register_lock); pr_info("Caching disabled for %s", 
bdevname(dc->bdev, buf)); @@ -905,10 +908,10 @@ void bch_cached_dev_detach(struct cached_dev *dc) { lockdep_assert_held(&bch_register_lock); - if (atomic_read(&dc->disk.closing)) + if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) return; - if (atomic_xchg(&dc->disk.detaching, 1)) + if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) return; /* @@ -1064,11 +1067,7 @@ static void cached_dev_flush(struct closure *cl) struct bcache_device *d = &dc->disk; mutex_lock(&bch_register_lock); - d->flush_done = 1; - - if (d->c) - bcache_device_unlink(d); - + bcache_device_unlink(d); mutex_unlock(&bch_register_lock); bch_cache_accounting_destroy(&dc->accounting); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4b67244..194d437 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -370,7 +370,7 @@ STORE(__bch_flash_dev) } if (attr == &sysfs_unregister) { - atomic_set(&d->detaching, 1); + set_bit(BCACHE_DEV_DETACHING, &d->flags); bcache_device_stop(d); } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 22e21dc..99053b1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -89,7 +89,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) { uint64_t ret; - if (atomic_read(&dc->disk.detaching) || + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || !dc->writeback_percent) return 0; @@ -404,7 +404,7 @@ static int bch_writeback_thread(void *arg) while (!kthread_should_stop()) { down_write(&dc->writeback_lock); if (!atomic_read(&dc->has_dirty) || - (!atomic_read(&dc->disk.detaching) && + (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && !dc->writeback_running)) { up_write(&dc->writeback_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -437,7 +437,7 @@ static int bch_writeback_thread(void *arg) while (delay && !kthread_should_stop() && - !atomic_read(&dc->disk.detaching)) + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) delay = schedule_timeout_interruptible(delay); } } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index fe7d9d5..c9ddcf4 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -45,7 +45,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned in_use = dc->disk.c->gc_stats.in_use; if (cache_mode != CACHE_MODE_WRITEBACK || - atomic_read(&dc->disk.detaching) || + test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || in_use > CUTOFF_WRITEBACK_SYNC) return false; -- cgit v0.10.2 From 28935ab5163c49ca5c199d67335e5e3c72c50853 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jul 2013 01:12:02 -0700 Subject: bcache: Use ida for bcache block dev minor Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fa1d530..70708ab 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -50,7 +51,8 @@ struct mutex bch_register_lock; LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices); -static int bcache_major, bcache_minor; +static int bcache_major; +static DEFINE_IDA(bcache_minor); static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; @@ -731,8 +733,10 @@ static void bcache_device_free(struct bcache_device *d) del_gendisk(d->disk); if (d->disk && d->disk->queue) blk_cleanup_queue(d->disk->queue); - if (d->disk) + if (d->disk) { + ida_simple_remove(&bcache_minor, d->disk->first_minor); 
put_disk(d->disk); + } bio_split_pool_free(&d->bio_split_hook); if (d->unaligned_bvec) @@ -756,6 +760,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, { struct request_queue *q; size_t n; + int minor; if (!d->stripe_size) d->stripe_size = 1 << 31; @@ -783,22 +788,31 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->full_dirty_stripes) return -ENOMEM; + minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); + if (minor < 0) + return minor; + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, sizeof(struct bio_vec) * BIO_MAX_PAGES)) || bio_split_pool_init(&d->bio_split_hook) || - !(d->disk = alloc_disk(1)) || - !(q = blk_alloc_queue(GFP_KERNEL))) + !(d->disk = alloc_disk(1))) { + ida_simple_remove(&bcache_minor, minor); return -ENOMEM; + } set_capacity(d->disk, sectors); - snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); d->disk->major = bcache_major; - d->disk->first_minor = bcache_minor++; + d->disk->first_minor = minor; d->disk->fops = &bcache_ops; d->disk->private_data = d; + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + return -ENOMEM; + blk_queue_make_request(q, NULL); d->disk->queue = q; q->queuedata = d; -- cgit v0.10.2 From 098fb25498214069e6bbf908515f2952dd7654d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2013 18:36:45 -0700 Subject: bcache: Delete some slower inline asm Never saw a profile of bset_search_tree() where it wasn't bottlenecked on memory until I got my new Haswell machine, but when I tried it there it was suddenly burning 20% of the cpu in the inner loop on shrd... Turns out, the version of shrd that takes 64 bit operands has a 9 cycle latency. hah. Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 1457339..7d388b8 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -481,16 +481,8 @@ static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline) static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) { -#ifdef CONFIG_X86_64 - asm("shrd %[shift],%[high],%[low]" - : [low] "+Rm" (low) - : [high] "R" (high), - [shift] "ci" (shift) - : "cc"); -#else low >>= shift; low |= (high << 1) << (63U - shift); -#endif return low; } -- cgit v0.10.2 From 5ceaaad7047745c1c02150c39d3fb623b7948d48 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 14:27:42 -0700 Subject: bcache: Bypass torture test More testing ftw! Also, now verify mode doesn't break if you read dirty data. 
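Two debug-only changes, both visible in the request.c hunks that follow: a bypass_torture_test knob that randomly routes roughly a quarter of otherwise-cacheable requests around the cache to exercise the bypass path, and a read_dirty_data flag on each search so verify mode skips the backing-device comparison whenever the data just read was dirty in the cache (where a mismatch is expected rather than a bug):

    if (bypass_torture_test(dc)) {
            if ((get_random_int() & 3) == 3)
                    goto skip;          /* bypass ~1 in 4 requests */
            else
                    goto rescale;
    }

    /* Lookup path: remember whether any key we read was dirty... */
    if (KEY_DIRTY(k))
            s->read_dirty_data = true;

    /* ...and only verify against the backing device when none was. */
    if (verify(dc, &s->bio.bio) && s->recoverable &&
        !s->unaligned_bvec && !s->read_dirty_data)
            bch_data_verify(dc, s->orig_bio);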
Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 97ef126..4beb55a 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -364,6 +364,7 @@ struct cached_dev { unsigned readahead; unsigned verify:1; + unsigned bypass_torture_test:1; unsigned partial_stripes_expensive:1; unsigned writeback_metadata:1; diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index e99e6b8..264fcfb 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -189,13 +189,14 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) void *p1 = kmap_atomic(bv->bv_page); void *p2 = page_address(check->bi_io_vec[i].bv_page); - if (memcmp(p1 + bv->bv_offset, - p2 + bv->bv_offset, - bv->bv_len)) - printk(KERN_ERR - "bcache (%s): verify failed at sector %llu\n", - bdevname(dc->bdev, name), - (uint64_t) bio->bi_sector); + cache_set_err_on(memcmp(p1 + bv->bv_offset, + p2 + bv->bv_offset, + bv->bv_len), + dc->disk.c, + "verify failed at dev %s sector %llu", + bdevname(dc->bdev, name), + (uint64_t) bio->bi_sector); + kunmap_atomic(p1); } diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 7914ba0..2ede60e 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -16,6 +16,7 @@ void bch_btree_iter_next_check(struct btree_iter *); #define EBUG_ON(cond) BUG_ON(cond) #define expensive_debug_checks(c) ((c)->expensive_debug_checks) #define key_merging_disabled(c) ((c)->key_merging_disabled) +#define bypass_torture_test(d) ((d)->bypass_torture_test) #else /* DEBUG */ @@ -28,6 +29,7 @@ static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} #define EBUG_ON(cond) do { if (cond); } while (0) #define expensive_debug_checks(c) 0 #define key_merging_disabled(c) 0 +#define bypass_torture_test(d) 0 #endif diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 9f5a138..fbcc851 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -528,6 +528,13 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; } + if (bypass_torture_test(dc)) { + if ((get_random_int() & 3) == 3) + goto skip; + else + goto rescale; + } + if (!congested && !dc->sequential_cutoff) goto rescale; @@ -601,6 +608,7 @@ struct search { unsigned recoverable:1; unsigned unaligned_bvec:1; unsigned write:1; + unsigned read_dirty_data:1; unsigned long start_time; @@ -669,6 +677,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; + if (KEY_DIRTY(k)) + s->read_dirty_data = true; + n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, KEY_OFFSET(k) - bio->bi_sector), GFP_NOIO, s->d->bio_split); @@ -894,7 +905,8 @@ static void cached_dev_read_done(struct closure *cl) s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable && !s->unaligned_bvec) + if (verify(dc, &s->bio.bio) && s->recoverable && + !s->unaligned_bvec && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio); bio_complete(s); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 194d437..80d4c2b 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -99,6 +99,7 @@ rw_attribute(errors); rw_attribute(io_error_limit); rw_attribute(io_error_halflife); rw_attribute(verify); +rw_attribute(bypass_torture_test); rw_attribute(key_merging_disabled); rw_attribute(gc_always_rewrite); rw_attribute(expensive_debug_checks); @@ -123,6 +124,7 @@ SHOW(__bch_cached_dev) 
sysfs_printf(data_csum, "%i", dc->disk.data_csum); var_printf(verify, "%i"); + var_printf(bypass_torture_test, "%i"); var_printf(writeback_metadata, "%i"); var_printf(writeback_running, "%i"); var_print(writeback_delay); @@ -191,6 +193,7 @@ STORE(__cached_dev) sysfs_strtoul(data_csum, dc->disk.data_csum); d_strtoul(verify); + d_strtoul(bypass_torture_test); d_strtoul(writeback_metadata); d_strtoul(writeback_running); d_strtoul(writeback_delay); @@ -323,6 +326,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_readahead, #ifdef CONFIG_BCACHE_DEBUG &sysfs_verify, + &sysfs_bypass_torture_test, #endif NULL }; -- cgit v0.10.2 From c86949486d41d9e7d7681fc72923555114fd702f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Sep 2013 21:41:34 -0700 Subject: bcache: defensively handle format strings Just to be safe, call the error reporting function with "%s" to avoid any possible future format string leak. Signed-off-by: Kees Cook Signed-off-by: Kent Overstreet diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 70708ab..dec15cd 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1670,7 +1670,7 @@ static void run_cache_set(struct cache_set *c) err: closure_sync(&cl); /* XXX: test this, it's broken */ - bch_cache_set_error(c, err); + bch_cache_set_error(c, "%s", err); } static bool can_attach_cache(struct cache *ca, struct cache_set *c) -- cgit v0.10.2 From f618ef7c47934d1686a764d0c9f70f23e566683f Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 14 Nov 2013 08:26:02 -0700 Subject: blk-mq: remove newly added instances of __cpuinit The new blk-mq code added new instances of __cpuinit usage. We removed this a couple versions ago; we now want to remove the compat no-op stubs. Introducing new users is not what we want to see at this point in time, as it will break once the stubs are gone. 
Signed-off-by: Paul Gortmaker Signed-off-by: Jens Axboe diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c index f8ea39d..0045ace 100644 --- a/block/blk-mq-cpu.c +++ b/block/blk-mq-cpu.c @@ -13,8 +13,8 @@ static LIST_HEAD(blk_mq_cpu_notify_list); static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock); -static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int blk_mq_main_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long) hcpu; struct blk_mq_cpu_notifier *notify; @@ -28,8 +28,8 @@ static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action, - unsigned int cpu) +static void blk_mq_cpu_notify(void *data, unsigned long action, + unsigned int cpu) { if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 88d4e86..3638a58 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1444,7 +1444,7 @@ void blk_mq_free_queue(struct request_queue *q) EXPORT_SYMBOL(blk_mq_free_queue); /* Basically redo blk_mq_init_queue with queue frozen */ -static void __cpuinit blk_mq_queue_reinit(struct request_queue *q) +static void blk_mq_queue_reinit(struct request_queue *q) { blk_mq_freeze_queue(q); @@ -1461,8 +1461,8 @@ static void __cpuinit blk_mq_queue_reinit(struct request_queue *q) blk_mq_unfreeze_queue(q); } -static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb, - unsigned long action, void *hcpu) +static int blk_mq_queue_reinit_notify(struct notifier_block *nb, + unsigned long action, void *hcpu) { struct request_queue *q; -- cgit v0.10.2 From 1cf7e9c68fe84248174e998922b39e508375e7c1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 Nov 2013 10:52:52 -0600 Subject: virtio_blk: blk-mq support Switch virtio-blk from the dual support for old-style requests and bios to use the block-multiqueue. Acked-by: Asias He Signed-off-by: Jens Axboe Signed-off-by: Christoph Hellwig diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5cdf88b..7455fe2 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -11,12 +11,11 @@ #include #include #include +#include +#include #define PART_BITS 4 -static bool use_bio; -module_param(use_bio, bool, S_IRUGO); - static int major; static DEFINE_IDA(vd_index_ida); @@ -26,13 +25,11 @@ struct virtio_blk { struct virtio_device *vdev; struct virtqueue *vq; - wait_queue_head_t queue_wait; + spinlock_t vq_lock; /* The disk structure for the kernel. */ struct gendisk *disk; - mempool_t *pool; - /* Process context for config space updates */ struct work_struct config_work; @@ -47,31 +44,17 @@ struct virtio_blk /* Ida index - used to track minor number allocations. */ int index; - - /* Scatterlist: can be too big for stack. 
*/ - struct scatterlist sg[/*sg_elems*/]; }; struct virtblk_req { struct request *req; - struct bio *bio; struct virtio_blk_outhdr out_hdr; struct virtio_scsi_inhdr in_hdr; - struct work_struct work; - struct virtio_blk *vblk; - int flags; u8 status; struct scatterlist sg[]; }; -enum { - VBLK_IS_FLUSH = 1, - VBLK_REQ_FLUSH = 2, - VBLK_REQ_DATA = 4, - VBLK_REQ_FUA = 8, -}; - static inline int virtblk_result(struct virtblk_req *vbr) { switch (vbr->status) { @@ -84,22 +67,6 @@ static inline int virtblk_result(struct virtblk_req *vbr) } } -static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, - gfp_t gfp_mask) -{ - struct virtblk_req *vbr; - - vbr = mempool_alloc(vblk->pool, gfp_mask); - if (!vbr) - return NULL; - - vbr->vblk = vblk; - if (use_bio) - sg_init_table(vbr->sg, vblk->sg_elems); - - return vbr; -} - static int __virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr, struct scatterlist *data_sg, @@ -143,83 +110,8 @@ static int __virtblk_add_req(struct virtqueue *vq, return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); } -static void virtblk_add_req(struct virtblk_req *vbr, bool have_data) -{ - struct virtio_blk *vblk = vbr->vblk; - DEFINE_WAIT(wait); - int ret; - - spin_lock_irq(vblk->disk->queue->queue_lock); - while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg, - have_data)) < 0)) { - prepare_to_wait_exclusive(&vblk->queue_wait, &wait, - TASK_UNINTERRUPTIBLE); - - spin_unlock_irq(vblk->disk->queue->queue_lock); - io_schedule(); - spin_lock_irq(vblk->disk->queue->queue_lock); - - finish_wait(&vblk->queue_wait, &wait); - } - - virtqueue_kick(vblk->vq); - spin_unlock_irq(vblk->disk->queue->queue_lock); -} - -static void virtblk_bio_send_flush(struct virtblk_req *vbr) -{ - vbr->flags |= VBLK_IS_FLUSH; - vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; - vbr->out_hdr.sector = 0; - vbr->out_hdr.ioprio = 0; - - virtblk_add_req(vbr, false); -} - -static void virtblk_bio_send_data(struct virtblk_req *vbr) -{ - struct virtio_blk *vblk = vbr->vblk; - struct bio *bio = vbr->bio; - bool have_data; - - vbr->flags &= ~VBLK_IS_FLUSH; - vbr->out_hdr.type = 0; - vbr->out_hdr.sector = bio->bi_sector; - vbr->out_hdr.ioprio = bio_prio(bio); - - if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) { - have_data = true; - if (bio->bi_rw & REQ_WRITE) - vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - else - vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - } else - have_data = false; - - virtblk_add_req(vbr, have_data); -} - -static void virtblk_bio_send_data_work(struct work_struct *work) -{ - struct virtblk_req *vbr; - - vbr = container_of(work, struct virtblk_req, work); - - virtblk_bio_send_data(vbr); -} - -static void virtblk_bio_send_flush_work(struct work_struct *work) -{ - struct virtblk_req *vbr; - - vbr = container_of(work, struct virtblk_req, work); - - virtblk_bio_send_flush(vbr); -} - static inline void virtblk_request_done(struct virtblk_req *vbr) { - struct virtio_blk *vblk = vbr->vblk; struct request *req = vbr->req; int error = virtblk_result(vbr); @@ -231,90 +123,43 @@ static inline void virtblk_request_done(struct virtblk_req *vbr) req->errors = (error != 0); } - __blk_end_request_all(req, error); - mempool_free(vbr, vblk->pool); -} - -static inline void virtblk_bio_flush_done(struct virtblk_req *vbr) -{ - struct virtio_blk *vblk = vbr->vblk; - - if (vbr->flags & VBLK_REQ_DATA) { - /* Send out the actual write data */ - INIT_WORK(&vbr->work, virtblk_bio_send_data_work); - queue_work(virtblk_wq, &vbr->work); - } else { - bio_endio(vbr->bio, 
virtblk_result(vbr)); - mempool_free(vbr, vblk->pool); - } -} - -static inline void virtblk_bio_data_done(struct virtblk_req *vbr) -{ - struct virtio_blk *vblk = vbr->vblk; - - if (unlikely(vbr->flags & VBLK_REQ_FUA)) { - /* Send out a flush before end the bio */ - vbr->flags &= ~VBLK_REQ_DATA; - INIT_WORK(&vbr->work, virtblk_bio_send_flush_work); - queue_work(virtblk_wq, &vbr->work); - } else { - bio_endio(vbr->bio, virtblk_result(vbr)); - mempool_free(vbr, vblk->pool); - } -} - -static inline void virtblk_bio_done(struct virtblk_req *vbr) -{ - if (unlikely(vbr->flags & VBLK_IS_FLUSH)) - virtblk_bio_flush_done(vbr); - else - virtblk_bio_data_done(vbr); + blk_mq_end_io(req, error); } static void virtblk_done(struct virtqueue *vq) { struct virtio_blk *vblk = vq->vdev->priv; - bool bio_done = false, req_done = false; + bool req_done = false; struct virtblk_req *vbr; unsigned long flags; unsigned int len; - spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); + spin_lock_irqsave(&vblk->vq_lock, flags); do { virtqueue_disable_cb(vq); while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { - if (vbr->bio) { - virtblk_bio_done(vbr); - bio_done = true; - } else { - virtblk_request_done(vbr); - req_done = true; - } + virtblk_request_done(vbr); + req_done = true; } } while (!virtqueue_enable_cb(vq)); + spin_unlock_irqrestore(&vblk->vq_lock, flags); + /* In case queue is stopped waiting for more buffers. */ if (req_done) - blk_start_queue(vblk->disk->queue); - spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags); - - if (bio_done) - wake_up(&vblk->queue_wait); + blk_mq_start_stopped_hw_queues(vblk->disk->queue); } -static bool do_req(struct request_queue *q, struct virtio_blk *vblk, - struct request *req) +static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) { + struct virtio_blk *vblk = hctx->queue->queuedata; + struct virtblk_req *vbr = req->special; + unsigned long flags; unsigned int num; - struct virtblk_req *vbr; + const bool last = (req->cmd_flags & REQ_END) != 0; - vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); - if (!vbr) - /* When another request finishes we'll try again. */ - return false; + BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); vbr->req = req; - vbr->bio = NULL; if (req->cmd_flags & REQ_FLUSH) { vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; vbr->out_hdr.sector = 0; @@ -342,7 +187,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, } } - num = blk_rq_map_sg(q, vbr->req, vblk->sg); + num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg); if (num) { if (rq_data_dir(vbr->req) == WRITE) vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; @@ -350,63 +195,18 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, vbr->out_hdr.type |= VIRTIO_BLK_T_IN; } - if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) { - mempool_free(vbr, vblk->pool); - return false; - } - - return true; -} - -static void virtblk_request(struct request_queue *q) -{ - struct virtio_blk *vblk = q->queuedata; - struct request *req; - unsigned int issued = 0; - - while ((req = blk_peek_request(q)) != NULL) { - BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); - - /* If this request fails, stop queue and wait for something to - finish to restart it. 
*/ - if (!do_req(q, vblk, req)) { - blk_stop_queue(q); - break; - } - blk_start_request(req); - issued++; - } - - if (issued) + spin_lock_irqsave(&vblk->vq_lock, flags); + if (__virtblk_add_req(vblk->vq, vbr, vbr->sg, num) < 0) { + spin_unlock_irqrestore(&vblk->vq_lock, flags); + blk_mq_stop_hw_queue(hctx); virtqueue_kick(vblk->vq); -} - -static void virtblk_make_request(struct request_queue *q, struct bio *bio) -{ - struct virtio_blk *vblk = q->queuedata; - struct virtblk_req *vbr; - - BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems); - - vbr = virtblk_alloc_req(vblk, GFP_NOIO); - if (!vbr) { - bio_endio(bio, -ENOMEM); - return; + return BLK_MQ_RQ_QUEUE_BUSY; } + spin_unlock_irqrestore(&vblk->vq_lock, flags); - vbr->bio = bio; - vbr->flags = 0; - if (bio->bi_rw & REQ_FLUSH) - vbr->flags |= VBLK_REQ_FLUSH; - if (bio->bi_rw & REQ_FUA) - vbr->flags |= VBLK_REQ_FUA; - if (bio->bi_size) - vbr->flags |= VBLK_REQ_DATA; - - if (unlikely(vbr->flags & VBLK_REQ_FLUSH)) - virtblk_bio_send_flush(vbr); - else - virtblk_bio_send_data(vbr); + if (last) + virtqueue_kick(vblk->vq); + return BLK_MQ_RQ_QUEUE_OK; } /* return id (s/n) string for *disk to *id_str @@ -680,12 +480,35 @@ static const struct device_attribute dev_attr_cache_type_rw = __ATTR(cache_type, S_IRUGO|S_IWUSR, virtblk_cache_type_show, virtblk_cache_type_store); +static struct blk_mq_ops virtio_mq_ops = { + .queue_rq = virtio_queue_rq, + .map_queue = blk_mq_map_queue, + .alloc_hctx = blk_mq_alloc_single_hw_queue, + .free_hctx = blk_mq_free_single_hw_queue, +}; + +static struct blk_mq_reg virtio_mq_reg = { + .ops = &virtio_mq_ops, + .nr_hw_queues = 1, + .queue_depth = 64, + .numa_node = NUMA_NO_NODE, + .flags = BLK_MQ_F_SHOULD_MERGE, +}; + +static void virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx, + struct request *rq, unsigned int nr) +{ + struct virtio_blk *vblk = data; + struct virtblk_req *vbr = rq->special; + + sg_init_table(vbr->sg, vblk->sg_elems); +} + static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; struct request_queue *q; int err, index; - int pool_size; u64 cap; u32 v, blk_size, sg_elems, opt_io_size; @@ -709,17 +532,14 @@ static int virtblk_probe(struct virtio_device *vdev) /* We need an extra sg elements at head and tail. */ sg_elems += 2; - vdev->priv = vblk = kmalloc(sizeof(*vblk) + - sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL); + vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); if (!vblk) { err = -ENOMEM; goto out_free_index; } - init_waitqueue_head(&vblk->queue_wait); vblk->vdev = vdev; vblk->sg_elems = sg_elems; - sg_init_table(vblk->sg, vblk->sg_elems); mutex_init(&vblk->config_lock); INIT_WORK(&vblk->config_work, virtblk_config_changed_work); @@ -728,31 +548,27 @@ static int virtblk_probe(struct virtio_device *vdev) err = init_vq(vblk); if (err) goto out_free_vblk; - - pool_size = sizeof(struct virtblk_req); - if (use_bio) - pool_size += sizeof(struct scatterlist) * sg_elems; - vblk->pool = mempool_create_kmalloc_pool(1, pool_size); - if (!vblk->pool) { - err = -ENOMEM; - goto out_free_vq; - } + spin_lock_init(&vblk->vq_lock); /* FIXME: How many partitions? How long is a piece of string? 
*/ vblk->disk = alloc_disk(1 << PART_BITS); if (!vblk->disk) { err = -ENOMEM; - goto out_mempool; + goto out_free_vq; } - q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL); + virtio_mq_reg.cmd_size = + sizeof(struct virtblk_req) + + sizeof(struct scatterlist) * sg_elems; + + q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk); if (!q) { err = -ENOMEM; goto out_put_disk; } - if (use_bio) - blk_queue_make_request(q, virtblk_make_request); + blk_mq_init_commands(q, virtblk_init_vbr, vblk); + q->queuedata = vblk; virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); @@ -857,8 +673,6 @@ out_del_disk: blk_cleanup_queue(vblk->disk->queue); out_put_disk: put_disk(vblk->disk); -out_mempool: - mempool_destroy(vblk->pool); out_free_vq: vdev->config->del_vqs(vdev); out_free_vblk: @@ -890,7 +704,6 @@ static void virtblk_remove(struct virtio_device *vdev) refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount); put_disk(vblk->disk); - mempool_destroy(vblk->pool); vdev->config->del_vqs(vdev); kfree(vblk); @@ -914,10 +727,7 @@ static int virtblk_freeze(struct virtio_device *vdev) flush_work(&vblk->config_work); - spin_lock_irq(vblk->disk->queue->queue_lock); - blk_stop_queue(vblk->disk->queue); - spin_unlock_irq(vblk->disk->queue->queue_lock); - blk_sync_queue(vblk->disk->queue); + blk_mq_stop_hw_queues(vblk->disk->queue); vdev->config->del_vqs(vdev); return 0; @@ -930,11 +740,9 @@ static int virtblk_restore(struct virtio_device *vdev) vblk->config_enable = true; ret = init_vq(vdev->priv); - if (!ret) { - spin_lock_irq(vblk->disk->queue->queue_lock); - blk_start_queue(vblk->disk->queue); - spin_unlock_irq(vblk->disk->queue->queue_lock); - } + if (!ret) + blk_mq_start_stopped_hw_queues(vblk->disk->queue); + return ret; } #endif -- cgit v0.10.2
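The virtio_blk conversion above follows the general blk-mq shape: the driver no longer peeks and pops requests itself but supplies a queue_rq handler; the per-request state that used to come from a mempool is allocated up front by blk-mq via cmd_size and reached through req->special; and when the virtqueue is full the handler stops the hardware queue and reports BUSY, with the completion interrupt restarting it. Condensed from the hunks above:

    /* Per-request driver state is carved out of each request by blk-mq. */
    virtio_mq_reg.cmd_size = sizeof(struct virtblk_req) +
                             sizeof(struct scatterlist) * sg_elems;

    /* In virtio_queue_rq(): ring full, back off and let the completion
     * path restart the queue. */
    if (__virtblk_add_req(vblk->vq, vbr, vbr->sg, num) < 0) {
            spin_unlock_irqrestore(&vblk->vq_lock, flags);
            blk_mq_stop_hw_queue(hctx);
            virtqueue_kick(vblk->vq);
            return BLK_MQ_RQ_QUEUE_BUSY;
    }

    /* In virtblk_done(): requests completed, so there is room again. */
    if (req_done)
            blk_mq_start_stopped_hw_queues(vblk->disk->queue);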