diff options
Diffstat (limited to 'drivers/md/bcache/bcache.h')
-rw-r--r-- | drivers/md/bcache/bcache.h | 327 |
1 files changed, 294 insertions, 33 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 4beb55a..0f12382 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -177,7 +177,6 @@ #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ -#include <linux/bcache.h> #include <linux/bio.h> #include <linux/kobject.h> #include <linux/list.h> @@ -211,6 +210,168 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); #define GC_MARK_METADATA 2 BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); +struct bkey { + uint64_t high; + uint64_t low; + uint64_t ptr[]; +}; + +/* Enough for a key with 6 pointers */ +#define BKEY_PAD 8 + +#define BKEY_PADDED(key) \ + union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } + +/* Version 0: Cache device + * Version 1: Backing device + * Version 2: Seed pointer into btree node checksum + * Version 3: Cache device with new UUID format + * Version 4: Backing device with data offset + */ +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_MAX_VERSION 4 + +#define SB_SECTOR 8 +#define SB_SIZE 4096 +#define SB_LABEL_SIZE 32 +#define SB_JOURNAL_BUCKETS 256U +/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ +#define MAX_CACHES_PER_SET 8 + +#define BDEV_DATA_START_DEFAULT 16 /* sectors */ + +struct cache_sb { + uint64_t csum; + uint64_t offset; /* sector where this sb was written */ + uint64_t version; + + uint8_t magic[16]; + + uint8_t uuid[16]; + union { + uint8_t set_uuid[16]; + uint64_t set_magic; + }; + uint8_t label[SB_LABEL_SIZE]; + + uint64_t flags; + uint64_t seq; + uint64_t pad[8]; + + union { + struct { + /* Cache devices */ + uint64_t nbuckets; /* device size */ + + uint16_t block_size; /* sectors */ + uint16_t bucket_size; /* sectors */ + + uint16_t nr_in_set; + uint16_t nr_this_dev; + }; + struct { + /* Backing devices */ + uint64_t data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + uint32_t last_mount; /* time_t */ + + uint16_t first_bucket; + union { + uint16_t njournal_buckets; + uint16_t keys; + }; + uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + +BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); +BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); +BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); +#define CACHE_REPLACEMENT_LRU 0U +#define CACHE_REPLACEMENT_FIFO 1U +#define CACHE_REPLACEMENT_RANDOM 2U + +BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); +#define CACHE_MODE_WRITETHROUGH 0U +#define CACHE_MODE_WRITEBACK 1U +#define CACHE_MODE_WRITEAROUND 2U +#define CACHE_MODE_NONE 3U +BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); +#define BDEV_STATE_NONE 0U +#define BDEV_STATE_CLEAN 1U +#define BDEV_STATE_DIRTY 2U +#define BDEV_STATE_STALE 3U + +/* Version 1: Seed pointer into btree node checksum + */ +#define BCACHE_BSET_VERSION 1 + +/* + * This is the on disk format for btree nodes - a btree node on disk is a list + * of these; within each set the keys are sorted + */ +struct bset { + uint64_t csum; + uint64_t magic; + uint64_t seq; + uint32_t version; + uint32_t keys; + + union { + struct bkey start[0]; + uint64_t d[0]; + }; +}; + +/* + * On disk format for priorities and gens - see super.c near prio_write() for + * more. + */ +struct prio_set { + uint64_t csum; + uint64_t magic; + uint64_t seq; + uint32_t version; + uint32_t pad; + + uint64_t next_bucket; + + struct bucket_disk { + uint16_t prio; + uint8_t gen; + } __attribute((packed)) data[]; +}; + +struct uuid_entry { + union { + struct { + uint8_t uuid[16]; + uint8_t label[32]; + uint32_t first_reg; + uint32_t last_reg; + uint32_t invalidated; + + uint32_t flags; + /* Size of flash only volumes */ + uint64_t sectors; + }; + + uint8_t pad[128]; + }; +}; + +BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); + #include "journal.h" #include "stats.h" struct search; @@ -223,6 +384,8 @@ struct keybuf_key { void *private; }; +typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); + struct keybuf { struct bkey last_scanned; spinlock_t lock; @@ -237,7 +400,7 @@ struct keybuf { struct rb_root keys; -#define KEYBUF_NR 500 +#define KEYBUF_NR 100 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); }; @@ -266,15 +429,16 @@ struct bcache_device { struct gendisk *disk; - unsigned long flags; -#define BCACHE_DEV_CLOSING 0 -#define BCACHE_DEV_DETACHING 1 -#define BCACHE_DEV_UNLINK_DONE 2 + /* If nonzero, we're closing */ + atomic_t closing; + + /* If nonzero, we're detaching/unregistering from cache set */ + atomic_t detaching; + int flush_done; - unsigned nr_stripes; - unsigned stripe_size; + uint64_t nr_stripes; + unsigned stripe_size_bits; atomic_t *stripe_sectors_dirty; - unsigned long *full_dirty_stripes; unsigned long sectors_dirty_last; long sectors_dirty_derivative; @@ -345,7 +509,7 @@ struct cached_dev { /* Limit number of writeback bios in flight */ struct semaphore in_flight; - struct task_struct *writeback_thread; + struct closure_with_timer writeback; struct keybuf writeback_keys; @@ -363,8 +527,8 @@ struct cached_dev { unsigned sequential_cutoff; unsigned readahead; + unsigned sequential_merge:1; unsigned verify:1; - unsigned bypass_torture_test:1; unsigned partial_stripes_expensive:1; unsigned writeback_metadata:1; @@ -456,6 +620,15 @@ struct cache { bool discard; /* Get rid of? */ + /* + * We preallocate structs for issuing discards to buckets, and keep them + * on this list when they're not in use; do_discard() issues discards + * whenever there's work to do and is called by free_some_buckets() and + * when a discard finishes. + */ + atomic_t discards_in_flight; + struct list_head discards; + struct journal_device journal; /* The rest of this all shows up in sysfs */ @@ -476,6 +649,7 @@ struct gc_stat { size_t nkeys; uint64_t data; /* sectors */ + uint64_t dirty; /* sectors */ unsigned in_use; /* percent */ }; @@ -570,8 +744,8 @@ struct cache_set { * basically a lock for this that we can wait on asynchronously. The * btree_root() macro releases the lock when it returns. */ - struct task_struct *try_harder; - wait_queue_head_t try_wait; + struct closure *try_harder; + struct closure_waitlist try_wait; uint64_t try_harder_start; /* @@ -585,7 +759,7 @@ struct cache_set { * written. */ atomic_t prio_blocked; - wait_queue_head_t bucket_wait; + struct closure_waitlist bucket_wait; /* * For any bio we don't skip we subtract the number of sectors from @@ -608,7 +782,7 @@ struct cache_set { struct gc_stat gc_stats; size_t nbuckets; - struct task_struct *gc_thread; + struct closure_with_waitlist gc; /* Where in the btree gc currently is */ struct bkey gc_done; @@ -621,10 +795,11 @@ struct cache_set { /* Counts how many sectors bio_insert has added to the cache */ atomic_t sectors_to_gc; - wait_queue_head_t moving_gc_wait; + struct closure moving_gc; + struct closure_waitlist moving_gc_wait; struct keybuf moving_gc_keys; /* Number of moving GC bios in flight */ - struct semaphore moving_in_flight; + atomic_t in_flight; struct btree *root; @@ -666,27 +841,22 @@ struct cache_set { unsigned congested_read_threshold_us; unsigned congested_write_threshold_us; + spinlock_t sort_time_lock; struct time_stats sort_time; struct time_stats btree_gc_time; struct time_stats btree_split_time; + spinlock_t btree_read_time_lock; struct time_stats btree_read_time; struct time_stats try_harder_time; atomic_long_t cache_read_races; atomic_long_t writeback_keys_done; atomic_long_t writeback_keys_failed; - - enum { - ON_ERROR_UNREGISTER, - ON_ERROR_PANIC, - } on_error; unsigned error_limit; unsigned error_decay; - unsigned short journal_delay_ms; unsigned verify:1; unsigned key_merging_disabled:1; - unsigned expensive_debug_checks:1; unsigned gc_always_rewrite:1; unsigned shrinker_disabled:1; unsigned copy_gc_enabled:1; @@ -695,6 +865,21 @@ struct cache_set { struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; }; +static inline bool key_merging_disabled(struct cache_set *c) +{ +#ifdef CONFIG_BCACHE_DEBUG + return c->key_merging_disabled; +#else + return 0; +#endif +} + +static inline bool SB_IS_BDEV(const struct cache_sb *sb) +{ + return sb->version == BCACHE_SB_VERSION_BDEV + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; +} + struct bbio { unsigned submit_time_us; union { @@ -748,6 +933,59 @@ static inline unsigned local_clock_us(void) #define prio_buckets(c) \ DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) +#define JSET_MAGIC 0x245235c1a3625032ULL +#define PSET_MAGIC 0x6750e15f87337f91ULL +#define BSET_MAGIC 0x90135c78b99e07f5ULL + +#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC) +#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC) +#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC) + +/* Bkey fields: all units are in sectors */ + +#define KEY_FIELD(name, field, offset, size) \ + BITMASK(name, struct bkey, field, offset, size) + +#define PTR_FIELD(name, offset, size) \ + static inline uint64_t name(const struct bkey *k, unsigned i) \ + { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ + \ + static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ + { \ + k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ + k->ptr[i] |= v << offset; \ + } + +KEY_FIELD(KEY_PTRS, high, 60, 3) +KEY_FIELD(HEADER_SIZE, high, 58, 2) +KEY_FIELD(KEY_CSUM, high, 56, 2) +KEY_FIELD(KEY_PINNED, high, 55, 1) +KEY_FIELD(KEY_DIRTY, high, 36, 1) + +KEY_FIELD(KEY_SIZE, high, 20, 16) +KEY_FIELD(KEY_INODE, high, 0, 20) + +/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ + +static inline uint64_t KEY_OFFSET(const struct bkey *k) +{ + return k->low; +} + +static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) +{ + k->low = v; +} + +PTR_FIELD(PTR_DEV, 51, 12) +PTR_FIELD(PTR_OFFSET, 8, 43) +PTR_FIELD(PTR_GEN, 0, 8) + +#define PTR_CHECK_DEV ((1 << 12) - 1) + +#define PTR(gen, offset, dev) \ + ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen) + static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) { return s >> c->bucket_bits; @@ -786,11 +1024,27 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, /* Btree key macros */ +/* + * The high bit being set is a relic from when we used it to do binary + * searches - it told you where a key started. It's not used anymore, + * and can probably be safely dropped. + */ +#define KEY(dev, sector, len) \ +((struct bkey) { \ + .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ + .low = (sector) \ +}) + static inline void bkey_init(struct bkey *k) { - *k = ZERO_KEY; + *k = KEY(0, 0, 0); } +#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) +#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) +#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) +#define ZERO_KEY KEY(0, 0, 0) + /* * This is used for various on disk data structures - cache_sb, prio_set, bset, * jset: The checksum is _always_ the first 8 bytes of these structs @@ -840,6 +1094,14 @@ do { \ for (b = (ca)->buckets + (ca)->sb.first_bucket; \ b < (ca)->buckets + (ca)->sb.nbuckets; b++) +static inline void __bkey_put(struct cache_set *c, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); +} + static inline void cached_dev_put(struct cached_dev *dc) { if (atomic_dec_and_test(&dc->count)) @@ -911,15 +1173,13 @@ uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); bool bch_bucket_add_unused(struct cache *, struct bucket *); -long bch_bucket_alloc(struct cache *, unsigned, bool); +long bch_bucket_alloc(struct cache *, unsigned, struct closure *); void bch_bucket_free(struct cache_set *, struct bkey *); int __bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, bool); + struct bkey *, int, struct closure *); int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, bool); -bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, - unsigned, unsigned, bool); + struct bkey *, int, struct closure *); __printf(2, 3) bool bch_cache_set_error(struct cache_set *, const char *, ...); @@ -927,7 +1187,7 @@ bool bch_cache_set_error(struct cache_set *, const char *, ...); void bch_prio_write(struct cache *); void bch_write_bdev_super(struct cached_dev *, struct closure *); -extern struct workqueue_struct *bcache_wq; +extern struct workqueue_struct *bcache_wq, *bch_gc_wq; extern const char * const bch_cache_modes[]; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; @@ -960,14 +1220,15 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); void bch_moving_init_cache_set(struct cache_set *); -int bch_open_buckets_alloc(struct cache_set *); -void bch_open_buckets_free(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); +void bch_cache_allocator_exit(struct cache *ca); int bch_cache_allocator_init(struct cache *ca); void bch_debug_exit(void); int bch_debug_init(struct kobject *); +void bch_writeback_exit(void); +int bch_writeback_init(void); void bch_request_exit(void); int bch_request_init(void); void bch_btree_exit(void); |