summaryrefslogtreecommitdiff
path: root/drivers/md/bcache/bcache.h
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/bcache/bcache.h')
-rw-r--r--drivers/md/bcache/bcache.h327
1 files changed, 294 insertions, 33 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 4beb55a..0f12382 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -177,7 +177,6 @@
#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
-#include <linux/bcache.h>
#include <linux/bio.h>
#include <linux/kobject.h>
#include <linux/list.h>
@@ -211,6 +210,168 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
#define GC_MARK_METADATA 2
BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
+struct bkey {
+ uint64_t high;
+ uint64_t low;
+ uint64_t ptr[];
+};
+
+/* Enough for a key with 6 pointers */
+#define BKEY_PAD 8
+
+#define BKEY_PADDED(key) \
+ union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; }
+
+/* Version 0: Cache device
+ * Version 1: Backing device
+ * Version 2: Seed pointer into btree node checksum
+ * Version 3: Cache device with new UUID format
+ * Version 4: Backing device with data offset
+ */
+#define BCACHE_SB_VERSION_CDEV 0
+#define BCACHE_SB_VERSION_BDEV 1
+#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
+#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
+#define BCACHE_SB_MAX_VERSION 4
+
+#define SB_SECTOR 8
+#define SB_SIZE 4096
+#define SB_LABEL_SIZE 32
+#define SB_JOURNAL_BUCKETS 256U
+/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
+#define MAX_CACHES_PER_SET 8
+
+#define BDEV_DATA_START_DEFAULT 16 /* sectors */
+
+struct cache_sb {
+ uint64_t csum;
+ uint64_t offset; /* sector where this sb was written */
+ uint64_t version;
+
+ uint8_t magic[16];
+
+ uint8_t uuid[16];
+ union {
+ uint8_t set_uuid[16];
+ uint64_t set_magic;
+ };
+ uint8_t label[SB_LABEL_SIZE];
+
+ uint64_t flags;
+ uint64_t seq;
+ uint64_t pad[8];
+
+ union {
+ struct {
+ /* Cache devices */
+ uint64_t nbuckets; /* device size */
+
+ uint16_t block_size; /* sectors */
+ uint16_t bucket_size; /* sectors */
+
+ uint16_t nr_in_set;
+ uint16_t nr_this_dev;
+ };
+ struct {
+ /* Backing devices */
+ uint64_t data_offset;
+
+ /*
+ * block_size from the cache device section is still used by
+ * backing devices, so don't add anything here until we fix
+ * things to not need it for backing devices anymore
+ */
+ };
+ };
+
+ uint32_t last_mount; /* time_t */
+
+ uint16_t first_bucket;
+ union {
+ uint16_t njournal_buckets;
+ uint16_t keys;
+ };
+ uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */
+};
+
+BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);
+BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1);
+BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3);
+#define CACHE_REPLACEMENT_LRU 0U
+#define CACHE_REPLACEMENT_FIFO 1U
+#define CACHE_REPLACEMENT_RANDOM 2U
+
+BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
+#define CACHE_MODE_WRITETHROUGH 0U
+#define CACHE_MODE_WRITEBACK 1U
+#define CACHE_MODE_WRITEAROUND 2U
+#define CACHE_MODE_NONE 3U
+BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2);
+#define BDEV_STATE_NONE 0U
+#define BDEV_STATE_CLEAN 1U
+#define BDEV_STATE_DIRTY 2U
+#define BDEV_STATE_STALE 3U
+
+/* Version 1: Seed pointer into btree node checksum
+ */
+#define BCACHE_BSET_VERSION 1
+
+/*
+ * This is the on disk format for btree nodes - a btree node on disk is a list
+ * of these; within each set the keys are sorted
+ */
+struct bset {
+ uint64_t csum;
+ uint64_t magic;
+ uint64_t seq;
+ uint32_t version;
+ uint32_t keys;
+
+ union {
+ struct bkey start[0];
+ uint64_t d[0];
+ };
+};
+
+/*
+ * On disk format for priorities and gens - see super.c near prio_write() for
+ * more.
+ */
+struct prio_set {
+ uint64_t csum;
+ uint64_t magic;
+ uint64_t seq;
+ uint32_t version;
+ uint32_t pad;
+
+ uint64_t next_bucket;
+
+ struct bucket_disk {
+ uint16_t prio;
+ uint8_t gen;
+ } __attribute((packed)) data[];
+};
+
+struct uuid_entry {
+ union {
+ struct {
+ uint8_t uuid[16];
+ uint8_t label[32];
+ uint32_t first_reg;
+ uint32_t last_reg;
+ uint32_t invalidated;
+
+ uint32_t flags;
+ /* Size of flash only volumes */
+ uint64_t sectors;
+ };
+
+ uint8_t pad[128];
+ };
+};
+
+BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
+
#include "journal.h"
#include "stats.h"
struct search;
@@ -223,6 +384,8 @@ struct keybuf_key {
void *private;
};
+typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
+
struct keybuf {
struct bkey last_scanned;
spinlock_t lock;
@@ -237,7 +400,7 @@ struct keybuf {
struct rb_root keys;
-#define KEYBUF_NR 500
+#define KEYBUF_NR 100
DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
};
@@ -266,15 +429,16 @@ struct bcache_device {
struct gendisk *disk;
- unsigned long flags;
-#define BCACHE_DEV_CLOSING 0
-#define BCACHE_DEV_DETACHING 1
-#define BCACHE_DEV_UNLINK_DONE 2
+ /* If nonzero, we're closing */
+ atomic_t closing;
+
+ /* If nonzero, we're detaching/unregistering from cache set */
+ atomic_t detaching;
+ int flush_done;
- unsigned nr_stripes;
- unsigned stripe_size;
+ uint64_t nr_stripes;
+ unsigned stripe_size_bits;
atomic_t *stripe_sectors_dirty;
- unsigned long *full_dirty_stripes;
unsigned long sectors_dirty_last;
long sectors_dirty_derivative;
@@ -345,7 +509,7 @@ struct cached_dev {
/* Limit number of writeback bios in flight */
struct semaphore in_flight;
- struct task_struct *writeback_thread;
+ struct closure_with_timer writeback;
struct keybuf writeback_keys;
@@ -363,8 +527,8 @@ struct cached_dev {
unsigned sequential_cutoff;
unsigned readahead;
+ unsigned sequential_merge:1;
unsigned verify:1;
- unsigned bypass_torture_test:1;
unsigned partial_stripes_expensive:1;
unsigned writeback_metadata:1;
@@ -456,6 +620,15 @@ struct cache {
bool discard; /* Get rid of? */
+ /*
+ * We preallocate structs for issuing discards to buckets, and keep them
+ * on this list when they're not in use; do_discard() issues discards
+ * whenever there's work to do and is called by free_some_buckets() and
+ * when a discard finishes.
+ */
+ atomic_t discards_in_flight;
+ struct list_head discards;
+
struct journal_device journal;
/* The rest of this all shows up in sysfs */
@@ -476,6 +649,7 @@ struct gc_stat {
size_t nkeys;
uint64_t data; /* sectors */
+ uint64_t dirty; /* sectors */
unsigned in_use; /* percent */
};
@@ -570,8 +744,8 @@ struct cache_set {
* basically a lock for this that we can wait on asynchronously. The
* btree_root() macro releases the lock when it returns.
*/
- struct task_struct *try_harder;
- wait_queue_head_t try_wait;
+ struct closure *try_harder;
+ struct closure_waitlist try_wait;
uint64_t try_harder_start;
/*
@@ -585,7 +759,7 @@ struct cache_set {
* written.
*/
atomic_t prio_blocked;
- wait_queue_head_t bucket_wait;
+ struct closure_waitlist bucket_wait;
/*
* For any bio we don't skip we subtract the number of sectors from
@@ -608,7 +782,7 @@ struct cache_set {
struct gc_stat gc_stats;
size_t nbuckets;
- struct task_struct *gc_thread;
+ struct closure_with_waitlist gc;
/* Where in the btree gc currently is */
struct bkey gc_done;
@@ -621,10 +795,11 @@ struct cache_set {
/* Counts how many sectors bio_insert has added to the cache */
atomic_t sectors_to_gc;
- wait_queue_head_t moving_gc_wait;
+ struct closure moving_gc;
+ struct closure_waitlist moving_gc_wait;
struct keybuf moving_gc_keys;
/* Number of moving GC bios in flight */
- struct semaphore moving_in_flight;
+ atomic_t in_flight;
struct btree *root;
@@ -666,27 +841,22 @@ struct cache_set {
unsigned congested_read_threshold_us;
unsigned congested_write_threshold_us;
+ spinlock_t sort_time_lock;
struct time_stats sort_time;
struct time_stats btree_gc_time;
struct time_stats btree_split_time;
+ spinlock_t btree_read_time_lock;
struct time_stats btree_read_time;
struct time_stats try_harder_time;
atomic_long_t cache_read_races;
atomic_long_t writeback_keys_done;
atomic_long_t writeback_keys_failed;
-
- enum {
- ON_ERROR_UNREGISTER,
- ON_ERROR_PANIC,
- } on_error;
unsigned error_limit;
unsigned error_decay;
-
unsigned short journal_delay_ms;
unsigned verify:1;
unsigned key_merging_disabled:1;
- unsigned expensive_debug_checks:1;
unsigned gc_always_rewrite:1;
unsigned shrinker_disabled:1;
unsigned copy_gc_enabled:1;
@@ -695,6 +865,21 @@ struct cache_set {
struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
};
+static inline bool key_merging_disabled(struct cache_set *c)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ return c->key_merging_disabled;
+#else
+ return 0;
+#endif
+}
+
+static inline bool SB_IS_BDEV(const struct cache_sb *sb)
+{
+ return sb->version == BCACHE_SB_VERSION_BDEV
+ || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
+}
+
struct bbio {
unsigned submit_time_us;
union {
@@ -748,6 +933,59 @@ static inline unsigned local_clock_us(void)
#define prio_buckets(c) \
DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
+#define JSET_MAGIC 0x245235c1a3625032ULL
+#define PSET_MAGIC 0x6750e15f87337f91ULL
+#define BSET_MAGIC 0x90135c78b99e07f5ULL
+
+#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC)
+#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC)
+#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC)
+
+/* Bkey fields: all units are in sectors */
+
+#define KEY_FIELD(name, field, offset, size) \
+ BITMASK(name, struct bkey, field, offset, size)
+
+#define PTR_FIELD(name, offset, size) \
+ static inline uint64_t name(const struct bkey *k, unsigned i) \
+ { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \
+ \
+ static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\
+ { \
+ k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \
+ k->ptr[i] |= v << offset; \
+ }
+
+KEY_FIELD(KEY_PTRS, high, 60, 3)
+KEY_FIELD(HEADER_SIZE, high, 58, 2)
+KEY_FIELD(KEY_CSUM, high, 56, 2)
+KEY_FIELD(KEY_PINNED, high, 55, 1)
+KEY_FIELD(KEY_DIRTY, high, 36, 1)
+
+KEY_FIELD(KEY_SIZE, high, 20, 16)
+KEY_FIELD(KEY_INODE, high, 0, 20)
+
+/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
+
+static inline uint64_t KEY_OFFSET(const struct bkey *k)
+{
+ return k->low;
+}
+
+static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v)
+{
+ k->low = v;
+}
+
+PTR_FIELD(PTR_DEV, 51, 12)
+PTR_FIELD(PTR_OFFSET, 8, 43)
+PTR_FIELD(PTR_GEN, 0, 8)
+
+#define PTR_CHECK_DEV ((1 << 12) - 1)
+
+#define PTR(gen, offset, dev) \
+ ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen)
+
static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
{
return s >> c->bucket_bits;
@@ -786,11 +1024,27 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
/* Btree key macros */
+/*
+ * The high bit being set is a relic from when we used it to do binary
+ * searches - it told you where a key started. It's not used anymore,
+ * and can probably be safely dropped.
+ */
+#define KEY(dev, sector, len) \
+((struct bkey) { \
+ .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \
+ .low = (sector) \
+})
+
static inline void bkey_init(struct bkey *k)
{
- *k = ZERO_KEY;
+ *k = KEY(0, 0, 0);
}
+#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k))
+#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0)
+#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0)
+#define ZERO_KEY KEY(0, 0, 0)
+
/*
* This is used for various on disk data structures - cache_sb, prio_set, bset,
* jset: The checksum is _always_ the first 8 bytes of these structs
@@ -840,6 +1094,14 @@ do { \
for (b = (ca)->buckets + (ca)->sb.first_bucket; \
b < (ca)->buckets + (ca)->sb.nbuckets; b++)
+static inline void __bkey_put(struct cache_set *c, struct bkey *k)
+{
+ unsigned i;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
+}
+
static inline void cached_dev_put(struct cached_dev *dc)
{
if (atomic_dec_and_test(&dc->count))
@@ -911,15 +1173,13 @@ uint8_t bch_inc_gen(struct cache *, struct bucket *);
void bch_rescale_priorities(struct cache_set *, int);
bool bch_bucket_add_unused(struct cache *, struct bucket *);
-long bch_bucket_alloc(struct cache *, unsigned, bool);
+long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
void bch_bucket_free(struct cache_set *, struct bkey *);
int __bch_bucket_alloc_set(struct cache_set *, unsigned,
- struct bkey *, int, bool);
+ struct bkey *, int, struct closure *);
int bch_bucket_alloc_set(struct cache_set *, unsigned,
- struct bkey *, int, bool);
-bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned,
- unsigned, unsigned, bool);
+ struct bkey *, int, struct closure *);
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *, const char *, ...);
@@ -927,7 +1187,7 @@ bool bch_cache_set_error(struct cache_set *, const char *, ...);
void bch_prio_write(struct cache *);
void bch_write_bdev_super(struct cached_dev *, struct closure *);
-extern struct workqueue_struct *bcache_wq;
+extern struct workqueue_struct *bcache_wq, *bch_gc_wq;
extern const char * const bch_cache_modes[];
extern struct mutex bch_register_lock;
extern struct list_head bch_cache_sets;
@@ -960,14 +1220,15 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *);
void bch_btree_cache_free(struct cache_set *);
int bch_btree_cache_alloc(struct cache_set *);
void bch_moving_init_cache_set(struct cache_set *);
-int bch_open_buckets_alloc(struct cache_set *);
-void bch_open_buckets_free(struct cache_set *);
int bch_cache_allocator_start(struct cache *ca);
+void bch_cache_allocator_exit(struct cache *ca);
int bch_cache_allocator_init(struct cache *ca);
void bch_debug_exit(void);
int bch_debug_init(struct kobject *);
+void bch_writeback_exit(void);
+int bch_writeback_init(void);
void bch_request_exit(void);
int bch_request_init(void);
void bch_btree_exit(void);