From eea1bf384e05b5ab747f8530c4fba9e9e6907fff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: md: Fix is_mddev_idle test (again). There are two problems with is_mddev_idle. 1/ sync_io is 'atomic_t' and hence 'int'. curr_events and all the rest are 'long'. So if sync_io were to wrap on a 64bit host, the value of curr_events would go very negative suddenly, and take a very long time to return to positive. So do all calculations as 'int'. That gives us plenty of precision for what we need. 2/ To initialise rdev->last_events we simply call is_mddev_idle, on the assumption that it will make sure that last_events is in a suitable range. It used to do this, but now it does not. So now we need to be more explicit about initialisation. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 03b4cd0..a99c50e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5716,19 +5716,19 @@ int unregister_md_personality(struct mdk_personality *p) return 0; } -static int is_mddev_idle(mddev_t *mddev) +static int is_mddev_idle(mddev_t *mddev, int init) { mdk_rdev_t * rdev; int idle; - long curr_events; + int curr_events; idle = 1; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = part_stat_read(&disk->part0, sectors[0]) + - part_stat_read(&disk->part0, sectors[1]) - - atomic_read(&disk->sync_io); + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and * disk_stats is counted when it completes. @@ -5751,7 +5751,7 @@ static int is_mddev_idle(mddev_t *mddev) * always make curr_events less than last_events. 
* */ - if (curr_events - rdev->last_events > 4096) { + if (init || curr_events - rdev->last_events > 64) { rdev->last_events = curr_events; idle = 0; } @@ -5994,7 +5994,7 @@ void md_do_sync(mddev_t *mddev) "(but not more than %d KB/sec) for %s.\n", speed_max(mddev), desc); - is_mddev_idle(mddev); /* this also initializes IO event counters */ + is_mddev_idle(mddev, 1); /* this initializes IO event counters */ io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { @@ -6096,7 +6096,7 @@ void md_do_sync(mddev_t *mddev) if (currspeed > speed_min(mddev)) { if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev)) { + !is_mddev_idle(mddev, 0)) { msleep(500); goto repeat; } diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 9743e4d..4aedb9f 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -51,7 +51,7 @@ struct mdk_rdev_s sector_t size; /* Device size (in blocks) */ mddev_t *mddev; /* RAID array if running */ - long last_events; /* IO event timestamp */ + int last_events; /* IO event timestamp */ struct block_device *bdev; /* block device handle */ -- cgit v0.10.2 From 1187cf0a3c8b647d08bc86e043563c8d2a327adc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: md: Allow write-intent bitmaps to have chunksize < PAGE_SIZE md currently insists that the chunk size used for write-intent bitmaps (the amount of data that corresponds to one chunk) be at least one page. The reason for this restriction is lost in the mists of time, but a review of the code (and a vague memory) suggests that the only problem would be related to resync. Resync tries very hard to work in multiples of a page, but also needs to sync with units of a bitmap_chunk too. This connection comes out in the bitmap_start_sync call. So change bitmap_start_sync to always work in multiples of a page. 
If the bitmap chunk size is less than one page, we flag multiple chunks as 'syncing' and generally make them all appear to the resync routines like one chunk. All other code either already works with data ranges that could span multiple chunks, or explicitly only cares about a single chunk. Signed-off-by: Neil Brown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 7199437..8fa3277 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -111,9 +111,10 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat unsigned char *mappage; if (page >= bitmap->pages) { - printk(KERN_ALERT - "%s: invalid bitmap page request: %lu (> %lu)\n", - bmname(bitmap), page, bitmap->pages-1); + /* This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page. + * It is harmless. + */ return -EINVAL; } @@ -570,7 +571,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) reason = "unrecognized superblock version"; - else if (chunksize < PAGE_SIZE) + else if (chunksize < 512) reason = "bitmap chunksize too small"; else if ((1 << ffz(~chunksize)) != chunksize) reason = "bitmap chunksize not a power of 2"; @@ -1345,8 +1346,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto } } -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, - int degraded) +static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, + int degraded) { bitmap_counter_t *bmc; int rv; @@ -1374,6 +1375,29 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, return rv; } +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, + int degraded) +{ + /* bitmap_start_sync must always report on multiples of whole + * pages, otherwise resync (which is very PAGE_SIZE based) will + * get confused.
+ * So call __bitmap_start_sync repeatedly (if needed) until + * At least PAGE_SIZE>>9 blocks are covered. + * Return the 'or' of the result. + */ + int rv = 0; + int blocks1; + + *blocks = 0; + while (*blocks < (PAGE_SIZE>>9)) { + rv |= __bitmap_start_sync(bitmap, offset, + &blocks1, degraded); + offset += blocks1; + *blocks += blocks1; + } + return rv; +} + void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) { bitmap_counter_t *bmc; -- cgit v0.10.2 From d0a4bb492772ce5c4bdfba3744a99ed6f6fb238f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: md: never clear bit from the write-intent bitmap when the array is degraded. It is safe to clear a bit from the write-intent bitmap for a raid1 if we know the data has been written to all devices, which is what the current test does. But it is not always safe to update the 'events_cleared' counter in that case. This is because one request could complete successfully after some other request has partially failed. So simply disable the clearing and updating of events_cleared whenever the array is degraded. This might end up not clearing some bits that could safely be cleared, but it is the safest approach. Note that the bug fixed here did not risk corrupting data by letting the array get out-of-sync. Rather it meant that when a device is removed and re-added to the array, it might incorrectly require a full recovery rather than just recovering based on the bitmap.
Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 8fa3277..2ef497d 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1307,6 +1307,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); } + if (bitmap->mddev->degraded) + /* Never clear bits or update events_cleared when degraded */ + success = 0; while (sectors) { int blocks; -- cgit v0.10.2 From 355a43e641b948a7b755cb4c2466ec548d5b495f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: md: write bitmap information to devices that are undergoing recovery. When we add some spares to an array and start recovery, and we have a bitmap which is stored 'internally' on all devices, we call bitmap_write_all to make sure the bitmap is correct on the new device(s). However that doesn't work as write_sb_page only writes to 'In_sync' devices, and devices undergoing recovery are not 'In_sync' until recovery finishes. So extend write_sb_page (actually next_active_rdev) to include devices that are under recovery. Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 2ef497d..27f978d 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -266,7 +266,6 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) list_for_each_continue_rcu(pos, &mddev->disks) { rdev = list_entry(pos, mdk_rdev_t, same_set); if (rdev->raid_disk >= 0 && - test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) { /* this is a usable devices */ atomic_inc(&rdev->nr_pending); -- cgit v0.10.2 From 3f9d99c12a533809342b475c95452e82761bcc1c Mon Sep 17 00:00:00 2001 From: "Martin K. 
Petersen" Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: MD data integrity support md: Add support for data integrity to MD If all subdevices support the same protection format the MD device is flagged as integrity capable. Signed-off-by: Martin K. Petersen Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index a99c50e..f30b461 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1438,6 +1438,38 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) static LIST_HEAD(pending_raid_disks); +static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) +{ + struct mdk_personality *pers = mddev->pers; + struct gendisk *disk = mddev->gendisk; + struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); + struct blk_integrity *bi_mddev = blk_get_integrity(disk); + + /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ + if (pers && pers->level >= 4 && pers->level <= 6) + return; + + /* If rdev is integrity capable, register profile for mddev */ + if (!bi_mddev && bi_rdev) { + if (blk_integrity_register(disk, bi_rdev)) + printk(KERN_ERR "%s: %s Could not register integrity!\n", + __func__, disk->disk_name); + else + printk(KERN_NOTICE "Enabling data integrity on %s\n", + disk->disk_name); + return; + } + + /* Check that mddev and rdev have matching profiles */ + if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { + printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, + disk->disk_name, rdev->bdev->bd_disk->disk_name); + printk(KERN_NOTICE "Disabling data integrity on %s\n", + disk->disk_name); + blk_integrity_unregister(disk); + } +} + static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) { char b[BDEVNAME_SIZE]; @@ -1508,6 +1540,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) /* May as well allow recovery to be retried once */ mddev->recovery_disabled = 0; + + md_integrity_check(rdev, mddev); return 0; fail: @@ -3794,6 +3828,10 @@ static int do_md_run(mddev_t * 
mddev) mddev->level = pers->level; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + if (pers->level >= 4 && pers->level <= 6) + /* Cannot support integrity (yet) */ + blk_integrity_unregister(mddev->gendisk); + if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ @@ -4129,6 +4167,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) printk(KERN_INFO "md: %s switched to read-only mode.\n", mdname(mddev)); err = 0; + blk_integrity_unregister(disk); md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); out: -- cgit v0.10.2 From 3dbd8c2e3ff0185585e068f190289d2a267a3e83 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: md: stop defining MAJOR_NR MAJOR_NR was only required for magic in linux/blk.h in 2.4 or earlier kernels, so no need to keep it around. Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index f30b461..3efc0bc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -46,8 +46,6 @@ #include #include -#define MAJOR_NR MD_MAJOR - /* 63 partitions with the alternate major number (mdp) */ #define MdpMinorShift 6 @@ -6503,13 +6501,13 @@ static void md_geninit(void) static int __init md_init(void) { - if (register_blkdev(MAJOR_NR, "md")) + if (register_blkdev(MD_MAJOR, "md")) return -1; if ((mdp_major=register_blkdev(0, "mdp"))<=0) { - unregister_blkdev(MAJOR_NR, "md"); + unregister_blkdev(MD_MAJOR, "md"); return -1; } - blk_register_region(MKDEV(MAJOR_NR, 0), 1UL< Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: cleanup drivers/md/Makefile Use the -y variables instead of the old -objs so we can easily add conditional objects to the modules. Also always use += to add subobjects to avoid problems when placing additional objects in some place in the file. 
Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 72880b7..3b118da 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -2,20 +2,20 @@ # Makefile for the kernel software RAID and LVM drivers. # -dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ +dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o -dm-multipath-objs := dm-path-selector.o dm-mpath.o -dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \ +dm-multipath-y += dm-path-selector.o dm-mpath.o +dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o -dm-mirror-objs := dm-raid1.o -md-mod-objs := md.o bitmap.o -raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \ +dm-mirror-y += dm-raid1.o +md-mod-y += md.o bitmap.o +raid456-y += raid5.o raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ raid6int8.o raid6int16.o raid6int32.o \ raid6altivec1.o raid6altivec2.o raid6altivec4.o \ raid6altivec8.o \ raid6mmx.o raid6sse1.o raid6sse2.o -hostprogs-y := mktables +hostprogs-y += mktables # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise -- cgit v0.10.2 From ef740c372dfd80e706dbf955d4e4aedda6c0c148 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Mar 2009 14:27:03 +1100 Subject: md: move headers out of include/linux/raid/ Move the headers with the local structures for the disciplines and bitmap.h into drivers/md/ so that they are more easily grepable for hacking and not far away. md.h is left where it is for now as there are some uses from the outside. 
Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 27f978d..7666117 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include "bitmap.h" /* debug macros */ diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h new file mode 100644 index 0000000..e989006 --- /dev/null +++ b/drivers/md/bitmap.h @@ -0,0 +1,288 @@ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +#define BITMAP_MINOR 39 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. 
+ * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared as well, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the field is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stripe is clean + * Anything that dirties the stripe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep finds the counter at 2, it is decremented to 1. + * If the sweep finds the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page?
*/ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SIZE 512 +#define BITMAP_BLOCK_SHIFT 9 + +/* how many blocks per chunk? (this is variable) */ +#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) + +/* when hijacked, the counters and bits represent even larger "chunks" */ +/* there will be 1024 chunks represented by each counter in the page pointers */ +#define PAGEPTR_BLOCK_RATIO(bitmap) \ + (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) +#define PAGEPTR_BLOCK_SHIFT(bitmap) \ + (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) +#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ + +/* map chunks (bits) to file pages - offset by the size of the superblock */ +#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ + BITMAP_HOSTENDIAN = 0x8000, +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __le32 magic; /* 0 BITMAP_MAGIC */ + __le32 version; /* 4 the bitmap major for now, could change... 
*/ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __le64 events; /* 24 event counter for the bitmap (1)*/ + __le64 events_cleared;/*32 event counter when last bit cleared (2) */ + __le64 sync_size; /* 40 the size of the md device's sync range(3) */ + __le32 state; /* 48 bitmap state information */ + __le32 chunksize; /* 52 the bitmap chunk size in bytes */ + __le32 daemon_sleep; /* 56 seconds between disk flushes */ + __le32 write_behind; /* 60 number of outstanding write-behind writes */ + + __u8 pad[256 - 64]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. 
+ */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * count of dirty bits on the page + */ + unsigned int count:31; +}; + +/* keep track of bitmap file pages that have pending writes on them */ +struct page_list { + struct list_head list; + struct page *page; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + struct bitmap_page *bp; + unsigned long pages; /* total number of pages in the bitmap */ + unsigned long missing_pages; /* number of pages not yet allocated */ + + mddev_t *mddev; /* the md device that the bitmap is for */ + + int counter_bits; /* how many bits per block counter */ + + /* bitmap chunksize -- how much data does each bit represent? */ + unsigned long chunksize; + unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunks; /* total number of data chunks for the array */ + + /* We hold a count on the chunk currently being synced, and drop + * it when the last block is started. If the resync is aborted + * midway, we need to be able to drop that count, so we remember + * the counted chunk.. 
+ */ + unsigned long syncchunk; + + __u64 events_cleared; + int need_sync; + + /* bitmap spinlock */ + spinlock_t lock; + + long offset; /* offset from superblock if file is NULL */ + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap file superblock */ + struct page **filemap; /* list of cache pages for the file */ + unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file */ + int last_page_size; /* bytes in the last page */ + + unsigned long flags; + + int allclean; + + unsigned long max_write_behind; /* write-behind mode */ + atomic_t behind_writes; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + unsigned long daemon_sleep; /* how many seconds between updates? */ + unsigned long last_end_sync; /* when we lasted called end_sync to + * update bitmap with resync progress */ + + atomic_t pending_writes; /* pending writes to the bitmap file */ + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(mddev_t *mddev); +void bitmap_flush(mddev_t *mddev); +void bitmap_destroy(mddev_t *mddev); + +void bitmap_print_sb(struct bitmap *bitmap); +void bitmap_update_sb(struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); +void bitmap_write_all(struct bitmap *bitmap); + +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); + +/* these are exported */ +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); 
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); + +void bitmap_unplug(struct bitmap *bitmap); +void bitmap_daemon_work(struct bitmap *bitmap); +#endif + +#endif diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 09658b2..3603ffa 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -16,7 +16,7 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include +#include "linear.h" /* * find which device holds a particular offset diff --git a/drivers/md/linear.h b/drivers/md/linear.h new file mode 100644 index 0000000..f38b9c5 --- /dev/null +++ b/drivers/md/linear.h @@ -0,0 +1,31 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +#include + +struct dev_info { + mdk_rdev_t *rdev; + sector_t num_sectors; + sector_t start_sector; +}; + +typedef struct dev_info dev_info_t; + +struct linear_private_data +{ + struct linear_private_data *prev; /* earlier version */ + dev_info_t **hash_table; + sector_t spacing; + sector_t array_sectors; + int sector_shift; /* shift before dividing + * by spacing + */ + dev_info_t disks[0]; +}; + + +typedef struct linear_private_data linear_conf_t; + +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) + +#endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 3efc0bc..9a3214c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -34,7 +34,6 @@ #include #include -#include #include #include /* for invalidate_bdev */ #include @@ -45,6 +44,7 @@ #include #include #include +#include "bitmap.h" /* 63 partitions with the alternate major number (mdp) */ #define MdpMinorShift 6 diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index f6d08f2..547df09 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -19,7 +19,7 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#include +#include "multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h new file mode 100644 index 0000000..6f53fc1 --- /dev/null +++ b/drivers/md/multipath.h @@ -0,0 +1,42 @@ +#ifndef _MULTIPATH_H +#define _MULTIPATH_H + +#include + +struct multipath_info { + mdk_rdev_t *rdev; +}; + +struct multipath_private_data { + mddev_t *mddev; + struct multipath_info *multipaths; + int raid_disks; + int working_disks; + spinlock_t device_lock; + struct list_head retry_list; + + mempool_t *pool; +}; + +typedef struct multipath_private_data multipath_conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) + +/* + * this is our 'private' 'collective' MULTIPATH buffer head. + * it contains information about what kind of IO operations were started + * for this MULTIPATH operation, and about their status: + */ + +struct multipath_bh { + mddev_t *mddev; + struct bio *master_bio; + struct bio bio; + int path; + struct list_head retry_list; +}; +#endif diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c605ba8..ef09ed0 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -18,7 +18,7 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#include +#include "raid0.h" static void raid0_unplug(struct request_queue *q) { diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h new file mode 100644 index 0000000..fd42aa8 --- /dev/null +++ b/drivers/md/raid0.h @@ -0,0 +1,30 @@ +#ifndef _RAID0_H +#define _RAID0_H + +#include + +struct strip_zone +{ + sector_t zone_start; /* Zone offset in md_dev (in sectors) */ + sector_t dev_start; /* Zone offset in real dev (in sectors) */ + sector_t sectors; /* Zone size in sectors */ + int nb_dev; /* # of devices attached to the zone */ + mdk_rdev_t **dev; /* Devices attached to the zone */ +}; + +struct raid0_private_data +{ + struct strip_zone **hash_table; /* Table of indexes into strip_zone */ + struct strip_zone *strip_zone; + mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ + int nr_strip_zones; + + sector_t spacing; + int sector_shift; /* shift this before divide by spacing */ +}; + +typedef struct raid0_private_data raid0_conf_t; + +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) + +#endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e246642..bff3228 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -33,8 +33,8 @@ #include "dm-bio-list.h" #include -#include -#include +#include "raid1.h" +#include "bitmap.h" #define DEBUG 0 #if DEBUG diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h new file mode 100644 index 0000000..0a9ba7c --- /dev/null +++ b/drivers/md/raid1.h @@ -0,0 +1,134 @@ +#ifndef _RAID1_H +#define _RAID1_H + +#include + +typedef struct mirror_info mirror_info_t; + +struct mirror_info { + mdk_rdev_t *rdev; + sector_t head_position; +}; + +/* + * memory pools need a pointer to the mddev, so they can force an unplug + * when memory is tight, and a count of the number of drives that the + * pool was allocated for, so they know how much to allocate and free. + * mddev->raid_disks cannot be used, as it can change while a pool is active + * These two datums are stored in a kmalloced struct. 
+ */ + +struct pool_info { + mddev_t *mddev; + int raid_disks; +}; + + +typedef struct r1bio_s r1bio_t; + +struct r1_private_data_s { + mddev_t *mddev; + mirror_info_t *mirrors; + int raid_disks; + int last_used; + sector_t next_seq_sect; + spinlock_t device_lock; + + struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + /* queue of writes that have been unplugged */ + struct bio_list flushing_bio_list; + + /* for use when syncing mirrors: */ + + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + wait_queue_head_t wait_barrier; + + struct pool_info *poolinfo; + + struct page *tmppage; + + mempool_t *r1bio_pool; + mempool_t *r1buf_pool; +}; + +typedef struct r1_private_data_s conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((conf_t *) mddev->private) + +/* + * this is our 'private' RAID1 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct r1bio_s { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + atomic_t behind_remaining; /* number of write-behind ios remaining + * in this BehindIO request + */ + sector_t sector; + int sectors; + unsigned long state; + mddev_t *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_disk; + + struct list_head retry_list; + struct bitmap_update *bitmap_update; + /* + * if the IO is in WRITE direction, then multiple bios are used. + * We choose the number when they are allocated. 
+ */ + struct bio *bios[0]; + /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ +}; + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio*)1) + +/* bits for r1bio.state */ +#define R1BIO_Uptodate 0 +#define R1BIO_IsSync 1 +#define R1BIO_Degraded 2 +#define R1BIO_BehindIO 3 +#define R1BIO_Barrier 4 +#define R1BIO_BarrierRetry 5 +/* For write-behind requests, we call bi_end_io when + * the last non-write-behind device completes, providing + * any write was successful. Otherwise we call when + * any write-behind write succeeds, otherwise we call + * with failure when last write completes (and all failed). + * Record that bi_end_io was called with this flag... + */ +#define R1BIO_Returned 6 + +#endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7301631..f03dd70 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -20,8 +20,8 @@ #include "dm-bio-list.h" #include -#include -#include +#include "raid10.h" +#include "bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. 
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h new file mode 100644 index 0000000..e9091cf --- /dev/null +++ b/drivers/md/raid10.h @@ -0,0 +1,123 @@ +#ifndef _RAID10_H +#define _RAID10_H + +#include + +typedef struct mirror_info mirror_info_t; + +struct mirror_info { + mdk_rdev_t *rdev; + sector_t head_position; +}; + +typedef struct r10bio_s r10bio_t; + +struct r10_private_data_s { + mddev_t *mddev; + mirror_info_t *mirrors; + int raid_disks; + spinlock_t device_lock; + + /* geometry */ + int near_copies; /* number of copies layed out raid0 style */ + int far_copies; /* number of copies layed out + * at large strides across drives + */ + int far_offset; /* far_copies are offset by 1 stripe + * instead of many + */ + int copies; /* near_copies * far_copies. + * must be <= raid_disks + */ + sector_t stride; /* distance between far copies. + * This is size / far_copies unless + * far_offset, in which case it is + * 1 stripe. + */ + + int chunk_shift; /* shift from chunks to sectors */ + sector_t chunk_mask; + + struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + + + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + wait_queue_head_t wait_barrier; + + mempool_t *r10bio_pool; + mempool_t *r10buf_pool; + struct page *tmppage; +}; + +typedef struct r10_private_data_s conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((conf_t *) mddev->private) + +/* + * this is our 'private' RAID10 bio. 
+ * + * it contains information about what kind of IO operations were started + * for this RAID10 operation, and about their status: + */ + +struct r10bio_s { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + sector_t sector; /* virtual sector number */ + int sectors; + unsigned long state; + mddev_t *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_slot; + + struct list_head retry_list; + /* + * if the IO is in WRITE direction, then multiple bios are used, + * one for each copy. + * When resyncing we also use one for each copy. + * When reconstructing, we use 2 bios, one for read, one for write. + * We choose the number when they are allocated. + */ + struct { + struct bio *bio; + sector_t addr; + int devnum; + } devs[0]; +}; + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio*)1) + +/* bits for r10bio.state */ +#define R10BIO_Uptodate 0 +#define R10BIO_IsSync 1 +#define R10BIO_IsRecover 2 +#define R10BIO_Degraded 3 +#endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5ba080..f75698b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -44,10 +44,9 @@ */ #include -#include "raid6.h" - -#include #include +#include "raid6.h" +#include "bitmap.h" /* * Stripe cache diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h new file mode 100644 index 0000000..40f1d03 --- /dev/null +++ b/drivers/md/raid5.h @@ -0,0 +1,402 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include +#include + +/* + * + * Each stripe contains one buffer per disc. Each buffer can be in + * one of a number of states stored in "flags". 
Changes between + * these states happen *almost* exclusively under a per-stripe + * spinlock. Some very specific changes can happen in bi_end_io, and + * these are not protected by the spin lock. + * + * The flag bits that are used to represent these states are: + * R5_UPTODATE and R5_LOCKED + * + * State Empty == !UPTODATE, !LOCK + * We have no data, and there is no active request + * State Want == !UPTODATE, LOCK + * A read request is being submitted for this block + * State Dirty == UPTODATE, LOCK + * Some new data is in this buffer, and it is being written out + * State Clean == UPTODATE, !LOCK + * We have valid data which is the same as on disc + * + * The possible state transitions are: + * + * Empty -> Want - on read or write to get old data for parity calc + * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) + * Empty -> Clean - on compute_block when computing a block for failed drive + * Want -> Empty - on failed read + * Want -> Clean - on successful completion of read request + * Dirty -> Clean - on successful completion of write request + * Dirty -> Clean - on failed write + * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) + * + * The Want->Empty, Want->Clean, Dirty->Clean, transitions + * all happen in b_end_io at interrupt time. + * Each sets the Uptodate bit before releasing the Lock bit. + * This leaves one multi-stage transition: + * Want->Dirty->Clean + * This is safe because thinking that a Clean buffer is actually dirty + * will at worst delay some action, and the stripe will be scheduled + * for attention after the transition is complete. + * + * There is one possibility that is not covered by these states. That + * is if one drive has failed and there is a spare being rebuilt. We + * can't distinguish between a clean block that has been generated + * from parity calculations, and a clean block that has been + * successfully written to the spare ( or to parity when resyncing). 
+ * To distinguish these states we have a stripe bit STRIPE_INSYNC that + * is set whenever a write is scheduled to the spare, or to the parity + * disc if there is no spare. A sync request clears this bit, and + * when we find it set with no buffers locked, we know the sync is + * complete. + * + * Buffers for the md device that arrive via make_request are attached + * to the appropriate stripe in one of two lists linked on b_reqnext. + * One list (bh_read) for read requests, one (bh_write) for write. + * There should never be more than one buffer on the two lists + * together, but we are not guaranteed of that so we allow for more. + * + * If a buffer is on the read list when the associated cache buffer is + * Uptodate, the data is copied into the read buffer and its b_end_io + * routine is called. This may happen in the end_request routine only + * if the buffer has just successfully been read. end_request should + * remove the buffers from the list and then set the Uptodate bit on + * the buffer. Other threads may do this only if they first check + * that the Uptodate bit is set. Once they have checked that they may + * take buffers off the read queue. + * + * When a buffer on the write list is committed for write it is copied + * into the cache buffer, which is then marked dirty, and moved onto a + * third list, the written list (bh_written). Once both the parity + * block and the cached buffer are successfully written, any buffer on + * a written list can be returned with b_end_io. + * + * The write list and read list both act as fifos. The read list is + * protected by the device_lock. The write and written lists are + * protected by the stripe lock. The device_lock, which can be + * claimed while the stripe lock is held, is only for list + * manipulations and will only be held for a very short time. It can + * be claimed from interrupts. + * + * + * Stripes in the stripe cache can be on one of two lists (or on + * neither).
The "inactive_list" contains stripes which are not + * currently being used for any request. They can freely be reused + * for another stripe. The "handle_list" contains stripes that need + * to be handled in some way. Both of these are fifo queues. Each + * stripe is also (potentially) linked to a hash bucket in the hash + * table so that it can be found by sector number. Stripes that are + * not hashed must be on the inactive_list, and will normally be at + * the front. All stripes start life this way. + * + * The inactive_list, handle_list and hash bucket lists are all protected by the + * device_lock. + * - stripes on the inactive_list never have their stripe_lock held. + * - stripes have a reference counter. If count==0, they are on a list. + * - If a stripe might need handling, STRIPE_HANDLE is set. + * - When refcount reaches zero, then if STRIPE_HANDLE it is put on + * handle_list else inactive_list + * + * This, combined with the fact that STRIPE_HANDLE is only ever + * cleared while a stripe has a non-zero count means that if the + * refcount is 0 and STRIPE_HANDLE is set, then it is on the + * handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then + * the stripe is on inactive_list. + * + * The possible transitions are: + * activate an unhashed/inactive stripe (get_active_stripe()) + * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev + * activate a hashed, possibly active stripe (get_active_stripe()) + * lockdev check-hash if(!cnt++)unlink-stripe unlockdev + * attach a request to an active stripe (add_stripe_bh()) + * lockdev attach-buffer unlockdev + * handle a stripe (handle_stripe()) + * lockstripe clrSTRIPE_HANDLE ... + * (lockdev check-buffers unlockdev) .. + * change-state ..
+ * record io/ops needed unlockstripe schedule io/ops + * release an active stripe (release_stripe()) + * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev + * + * The refcount counts each thread that have activated the stripe, + * plus raid5d if it is handling it, plus one for each active request + * on a cached buffer, and plus one if the stripe is undergoing stripe + * operations. + * + * Stripe operations are performed outside the stripe lock, + * the stripe operations are: + * -copying data between the stripe cache and user application buffers + * -computing blocks to save a disk access, or to recover a missing block + * -updating the parity on a write operation (reconstruct write and + * read-modify-write) + * -checking parity correctness + * -running i/o to disk + * These operations are carried out by raid5_run_ops which uses the async_tx + * api to (optionally) offload operations to dedicated hardware engines. + * When requesting an operation handle_stripe sets the pending bit for the + * operation and increments the count. raid5_run_ops is then run whenever + * the count is non-zero. + * There are some critical dependencies between the operations that prevent some + * from being requested while another is in flight. + * 1/ Parity check operations destroy the in cache version of the parity block, + * so we prevent parity dependent operations like writes and compute_blocks + * from starting while a check is in progress. Some dma engines can perform + * the check without damaging the parity block, in these cases the parity + * block is re-marked up to date (assuming the check was successful) and is + * not re-read from disk. + * 2/ When a write operation is requested we immediately lock the affected + * blocks, and mark them as not up to date. This causes new read requests + * to be held off, as well as parity checks and compute block operations. 
+ * 3/ Once a compute block operation has been requested handle_stripe treats + * that block as if it is up to date. raid5_run_ops guarantees that any + * operation that is dependent on the compute block result is initiated after + * the compute block completes. + */ + +/* + * Operations state - intermediate states that are visible outside of sh->lock + * In general _idle indicates nothing is running, _run indicates a data + * processing operation is active, and _result means the data processing result + * is stable and can be acted upon. For simple operations like biofill and + * compute that only have an _idle and _run state they are indicated with + * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) + */ +/** + * enum check_states - handles syncing / repairing a stripe + * @check_state_idle - check operations are quiesced + * @check_state_run - check operation is running + * @check_state_check_result - set outside lock when check result is valid + * @check_state_compute_run - check failed and we are repairing + * @check_state_compute_result - set outside lock when compute result is valid + */ +enum check_states { + check_state_idle = 0, + check_state_run, /* parity check */ + check_state_check_result, + check_state_compute_run, /* parity repair */ + check_state_compute_result, +}; + +/** + * enum reconstruct_states - handles writing or expanding a stripe + */ +enum reconstruct_states { + reconstruct_state_idle = 0, + reconstruct_state_prexor_drain_run, /* prexor-write */ + reconstruct_state_drain_run, /* write */ + reconstruct_state_run, /* expand */ + reconstruct_state_prexor_drain_result, + reconstruct_state_drain_result, + reconstruct_state_result, +}; + +struct stripe_head { + struct hlist_node hash; + struct list_head lru; /* inactive_list or handle_list */ + struct raid5_private_data *raid_conf; + sector_t sector; /* sector of this row */ + int pd_idx; /* parity disk index */ + unsigned long state; /* state flags */ + atomic_t count; /* nr of active
thread/requests */ + spinlock_t lock; + int bm_seq; /* sequence number for bitmap flushes */ + int disks; /* disks in stripe */ + enum check_states check_state; + enum reconstruct_states reconstruct_state; + /* stripe_operations + * @target - STRIPE_OP_COMPUTE_BLK target + */ + struct stripe_operations { + int target; + u32 zero_sum_result; + } ops; + struct r5dev { + struct bio req; + struct bio_vec vec; + struct page *page; + struct bio *toread, *read, *towrite, *written; + sector_t sector; /* sector of this page */ + unsigned long flags; + } dev[1]; /* allocated with extra space depending of RAID geometry */ +}; + +/* stripe_head_state - collects and tracks the dynamic state of a stripe_head + * for handle_stripe. It is only valid under spin_lock(sh->lock); + */ +struct stripe_head_state { + int syncing, expanding, expanded; + int locked, uptodate, to_read, to_write, failed, written; + int to_fill, compute, req_compute, non_overwrite; + int failed_num; + unsigned long ops_request; +}; + +/* r6_state - extra state data only relevant to r6 */ +struct r6_state { + int p_failed, q_failed, qd_idx, failed_num[2]; +}; + +/* Flags */ +#define R5_UPTODATE 0 /* page contains current data */ +#define R5_LOCKED 1 /* IO has been submitted on "req" */ +#define R5_OVERWRITE 2 /* towrite covers whole page */ +/* and some that are internal to handle_stripe */ +#define R5_Insync 3 /* rdev && rdev->in_sync at start */ +#define R5_Wantread 4 /* want to schedule a read */ +#define R5_Wantwrite 5 +#define R5_Overlap 7 /* There is a pending overlapping request on this block */ +#define R5_ReadError 8 /* seen a read error here recently */ +#define R5_ReWrite 9 /* have tried to over-write the readerror */ + +#define R5_Expanded 10 /* This block now has post-expand data */ +#define R5_Wantcompute 11 /* compute_block in progress treat as + * uptodate + */ +#define R5_Wantfill 12 /* dev->toread contains a bio that needs + * filling + */ +#define R5_Wantdrain 13 /* dev->towrite needs to be 
drained */ +/* + * Write method + */ +#define RECONSTRUCT_WRITE 1 +#define READ_MODIFY_WRITE 2 +/* not a write method, but a compute_parity mode */ +#define CHECK_PARITY 3 + +/* + * Stripe state + */ +#define STRIPE_HANDLE 2 +#define STRIPE_SYNCING 3 +#define STRIPE_INSYNC 4 +#define STRIPE_PREREAD_ACTIVE 5 +#define STRIPE_DELAYED 6 +#define STRIPE_DEGRADED 7 +#define STRIPE_BIT_DELAY 8 +#define STRIPE_EXPANDING 9 +#define STRIPE_EXPAND_SOURCE 10 +#define STRIPE_EXPAND_READY 11 +#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ +#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ +#define STRIPE_BIOFILL_RUN 14 +#define STRIPE_COMPUTE_RUN 15 +/* + * Operation request flags + */ +#define STRIPE_OP_BIOFILL 0 +#define STRIPE_OP_COMPUTE_BLK 1 +#define STRIPE_OP_PREXOR 2 +#define STRIPE_OP_BIODRAIN 3 +#define STRIPE_OP_POSTXOR 4 +#define STRIPE_OP_CHECK 5 + +/* + * Plugging: + * + * To improve write throughput, we need to delay the handling of some + * stripes until there has been a chance that several write requests + * for the one stripe have all been collected. + * In particular, any write request that would require pre-reading + * is put on a "delayed" queue until there are no stripes currently + * in a pre-read phase. Further, if the "delayed" queue is empty when + * a stripe is put on it then we "plug" the queue and do not process it + * until an unplug call is made. (the unplug_io_fn() is called). + * + * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add + * it to the count of prereading stripes. + * When write is initiated, or the stripe refcnt == 0 (just in case) we + * clear the PREREAD_ACTIVE flag and decrement the count + * Whenever the 'handle' queue is empty and the device is not plugged, we + * move any strips from delayed to handle and clear the DELAYED flag and set + * PREREAD_ACTIVE. 
+ * In stripe_handle, if we find pre-reading is necessary, we do it if + * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. + * HANDLE gets cleared if stripe_handle leaves nothing locked. + */ + + +struct disk_info { + mdk_rdev_t *rdev; +}; + +struct raid5_private_data { + struct hlist_head *stripe_hashtbl; + mddev_t *mddev; + struct disk_info *spare; + int chunk_size, level, algorithm; + int max_degraded; + int raid_disks; + int max_nr_stripes; + + /* used during an expand */ + sector_t expand_progress; /* MaxSector when no expand happening */ + sector_t expand_lo; /* from here up to expand_progress is out-of-bounds + * as we haven't flushed the metadata yet + */ + int previous_raid_disks; + + struct list_head handle_list; /* stripes needing handling */ + struct list_head hold_list; /* preread ready stripes */ + struct list_head delayed_list; /* stripes that have plugged requests */ + struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ + struct bio *retry_read_aligned; /* currently retrying aligned bios */ + struct bio *retry_read_aligned_list; /* aligned bios retry list */ + atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t active_aligned_reads; + atomic_t pending_full_writes; /* full write backlog */ + int bypass_count; /* bypassed prereads */ + int bypass_threshold; /* preread nice */ + struct list_head *last_hold; /* detect hold_list promotions */ + + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ + /* unfortunately we need two cache names as we temporarily have + * two caches. + */ + int active_name; + char cache_name[2][20]; + struct kmem_cache *slab_cache; /* for allocating stripes */ + + int seq_flush, seq_write; + int quiesce; + + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes.
+ */ + + struct page *spare_page; /* Used when checking P/Q in raid6 */ + + /* + * Free stripes pool + */ + atomic_t active_stripes; + struct list_head inactive_list; + wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_overlap; + int inactive_blocked; /* release of inactive stripes blocked, + * waiting for 25% to be free + */ + int pool_size; /* number of disks in stripeheads in pool */ + spinlock_t device_lock; + struct disk_info *disks; +}; + +typedef struct raid5_private_data raid5_conf_t; + +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + +#endif diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h index 98dcde8..f6c13af 100644 --- a/drivers/md/raid6.h +++ b/drivers/md/raid6.h @@ -19,7 +19,7 @@ #define RAID6_USE_EMPTY_ZERO_PAGE 0 #include -#include +#include "raid5.h" typedef raid5_conf_t raid6_conf_t; /* Same configuration */ diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h deleted file mode 100644 index e989006..0000000 --- a/include/linux/raid/bitmap.h +++ /dev/null @@ -1,288 +0,0 @@ -/* - * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 - * - * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. - */ -#ifndef BITMAP_H -#define BITMAP_H 1 - -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_HOSTENDIAN 3 - -#define BITMAP_MINOR 39 - -/* - * in-memory bitmap: - * - * Use 16 bit block counters to track pending writes to each "chunk". - * The 2 high order bits are special-purpose, the first is a flag indicating - * whether a resync is needed. The second is a flag indicating whether a - * resync is active. 
- * This means that the counter is actually 14 bits: - * - * +--------+--------+------------------------------------------------+ - * | resync | resync | counter | - * | needed | active | | - * | (0-1) | (0-1) | (0-16383) | - * +--------+--------+------------------------------------------------+ - * - * The "resync needed" bit is set when: - * a '1' bit is read from storage at startup. - * a write request fails on some drives - * a resync is aborted on a chunk with 'resync active' set - * It is cleared (and resync-active set) when a resync starts across all drives - * of the chunk. - * - * - * The "resync active" bit is set when: - * a resync is started on all drives, and resync_needed is set. - * resync_needed will be cleared (as long as resync_active wasn't already set). - * It is cleared when a resync completes. - * - * The counter counts pending write requests, plus the on-disk bit. - * When the counter is '1' and the resync bits are clear, the on-disk - * bit can be cleared aswell, thus setting the counter to 0. - * When we set a bit, or in the counter (to start a write), if the fields is - * 0, we first set the disk bit and set the counter to 1. - * - * If the counter is 0, the on-disk bit is clear and the stipe is clean - * Anything that dirties the stipe pushes the counter to 2 (at least) - * and sets the on-disk bit (lazily). - * If a periodic sweep find the counter at 2, it is decremented to 1. - * If the sweep find the counter at 1, the on-disk bit is cleared and the - * counter goes to zero. 
- * - * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block - * counters as a fallback when "page" memory cannot be allocated: - * - * Normal case (page memory allocated): - * - * page pointer (32-bit) - * - * [ ] ------+ - * | - * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) - * c1 c2 c2048 - * - * Hijacked case (page memory allocation failed): - * - * hijacked page pointer (32-bit) - * - * [ ][ ] (no page memory allocated) - * counter #1 (16-bit) counter #2 (16-bit) - * - */ - -#ifdef __KERNEL__ - -#define PAGE_BITS (PAGE_SIZE << 3) -#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) - -typedef __u16 bitmap_counter_t; -#define COUNTER_BITS 16 -#define COUNTER_BIT_SHIFT 4 -#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) -#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) - -#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) -#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) -#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) -#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) -#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) -#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) - -/* how many counters per page? */ -#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) -/* same, except a shift value for more efficient bitops */ -#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) -/* same, except a mask value for more efficient bitops */ -#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) - -#define BITMAP_BLOCK_SIZE 512 -#define BITMAP_BLOCK_SHIFT 9 - -/* how many blocks per chunk? 
(this is variable) */ -#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) -#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) -#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) - -/* when hijacked, the counters and bits represent even larger "chunks" */ -/* there will be 1024 chunks represented by each counter in the page pointers */ -#define PAGEPTR_BLOCK_RATIO(bitmap) \ - (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) -#define PAGEPTR_BLOCK_SHIFT(bitmap) \ - (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) -#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) - -/* - * on-disk bitmap: - * - * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap - * file a page at a time. There's a superblock at the start of the file. - */ - -/* map chunks (bits) to file pages - offset by the size of the superblock */ -#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) - -#endif - -/* - * bitmap structures: - */ - -#define BITMAP_MAGIC 0x6d746962 - -/* use these for bitmap->flags and bitmap->sb->state bit-fields */ -enum bitmap_state { - BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ - BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ - BITMAP_HOSTENDIAN = 0x8000, -}; - -/* the superblock at the front of the bitmap file -- little endian */ -typedef struct bitmap_super_s { - __le32 magic; /* 0 BITMAP_MAGIC */ - __le32 version; /* 4 the bitmap major for now, could change... 
*/ - __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ - __le64 events; /* 24 event counter for the bitmap (1)*/ - __le64 events_cleared;/*32 event counter when last bit cleared (2) */ - __le64 sync_size; /* 40 the size of the md device's sync range(3) */ - __le32 state; /* 48 bitmap state information */ - __le32 chunksize; /* 52 the bitmap chunk size in bytes */ - __le32 daemon_sleep; /* 56 seconds between disk flushes */ - __le32 write_behind; /* 60 number of outstanding write-behind writes */ - - __u8 pad[256 - 64]; /* set to zero */ -} bitmap_super_t; - -/* notes: - * (1) This event counter is updated before the eventcounter in the md superblock - * When a bitmap is loaded, it is only accepted if this event counter is equal - * to, or one greater than, the event counter in the superblock. - * (2) This event counter is updated when the other one is *if*and*only*if* the - * array is not degraded. As bits are not cleared when the array is degraded, - * this represents the last time that any bits were cleared. - * If a device is being added that has an event count with this value or - * higher, it is accepted as conforming to the bitmap. - * (3)This is the number of sectors represented by the bitmap, and is the range that - * resync happens across. For raid1 and raid5/6 it is the size of individual - * devices. For raid10 it is the size of the array. 
- */ - -#ifdef __KERNEL__ - -/* the in-memory bitmap is represented by bitmap_pages */ -struct bitmap_page { - /* - * map points to the actual memory page - */ - char *map; - /* - * in emergencies (when map cannot be alloced), hijack the map - * pointer and use it as two counters itself - */ - unsigned int hijacked:1; - /* - * count of dirty bits on the page - */ - unsigned int count:31; -}; - -/* keep track of bitmap file pages that have pending writes on them */ -struct page_list { - struct list_head list; - struct page *page; -}; - -/* the main bitmap structure - one per mddev */ -struct bitmap { - struct bitmap_page *bp; - unsigned long pages; /* total number of pages in the bitmap */ - unsigned long missing_pages; /* number of pages not yet allocated */ - - mddev_t *mddev; /* the md device that the bitmap is for */ - - int counter_bits; /* how many bits per block counter */ - - /* bitmap chunksize -- how much data does each bit represent? */ - unsigned long chunksize; - unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ - unsigned long chunks; /* total number of data chunks for the array */ - - /* We hold a count on the chunk currently being synced, and drop - * it when the last block is started. If the resync is aborted - * midway, we need to be able to drop that count, so we remember - * the counted chunk.. 
- */ - unsigned long syncchunk; - - __u64 events_cleared; - int need_sync; - - /* bitmap spinlock */ - spinlock_t lock; - - long offset; /* offset from superblock if file is NULL */ - struct file *file; /* backing disk file */ - struct page *sb_page; /* cached copy of the bitmap file superblock */ - struct page **filemap; /* list of cache pages for the file */ - unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ - unsigned long file_pages; /* number of pages in the file */ - int last_page_size; /* bytes in the last page */ - - unsigned long flags; - - int allclean; - - unsigned long max_write_behind; /* write-behind mode */ - atomic_t behind_writes; - - /* - * the bitmap daemon - periodically wakes up and sweeps the bitmap - * file, cleaning up bits and flushing out pages to disk as necessary - */ - unsigned long daemon_lastrun; /* jiffies of last run */ - unsigned long daemon_sleep; /* how many seconds between updates? */ - unsigned long last_end_sync; /* when we lasted called end_sync to - * update bitmap with resync progress */ - - atomic_t pending_writes; /* pending writes to the bitmap file */ - wait_queue_head_t write_wait; - wait_queue_head_t overflow_wait; - -}; - -/* the bitmap API */ - -/* these are used only by md/bitmap */ -int bitmap_create(mddev_t *mddev); -void bitmap_flush(mddev_t *mddev); -void bitmap_destroy(mddev_t *mddev); - -void bitmap_print_sb(struct bitmap *bitmap); -void bitmap_update_sb(struct bitmap *bitmap); - -int bitmap_setallbits(struct bitmap *bitmap); -void bitmap_write_all(struct bitmap *bitmap); - -void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); - -/* these are exported */ -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int behind); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int success, int behind); -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); 
-void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); -void bitmap_close_sync(struct bitmap *bitmap); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); - -void bitmap_unplug(struct bitmap *bitmap); -void bitmap_daemon_work(struct bitmap *bitmap); -#endif - -#endif diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h deleted file mode 100644 index f38b9c5..0000000 --- a/include/linux/raid/linear.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _LINEAR_H -#define _LINEAR_H - -#include - -struct dev_info { - mdk_rdev_t *rdev; - sector_t num_sectors; - sector_t start_sector; -}; - -typedef struct dev_info dev_info_t; - -struct linear_private_data -{ - struct linear_private_data *prev; /* earlier version */ - dev_info_t **hash_table; - sector_t spacing; - sector_t array_sectors; - int sector_shift; /* shift before dividing - * by spacing - */ - dev_info_t disks[0]; -}; - - -typedef struct linear_private_data linear_conf_t; - -#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) - -#endif diff --git a/include/linux/raid/multipath.h b/include/linux/raid/multipath.h deleted file mode 100644 index 6f53fc1..0000000 --- a/include/linux/raid/multipath.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _MULTIPATH_H -#define _MULTIPATH_H - -#include - -struct multipath_info { - mdk_rdev_t *rdev; -}; - -struct multipath_private_data { - mddev_t *mddev; - struct multipath_info *multipaths; - int raid_disks; - int working_disks; - spinlock_t device_lock; - struct list_head retry_list; - - mempool_t *pool; -}; - -typedef struct multipath_private_data multipath_conf_t; - -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) - -/* - * this is our 'private' 'collective' MULTIPATH buffer head. 
- * it contains information about what kind of IO operations were started - * for this MULTIPATH operation, and about their status: - */ - -struct multipath_bh { - mddev_t *mddev; - struct bio *master_bio; - struct bio bio; - int path; - struct list_head retry_list; -}; -#endif diff --git a/include/linux/raid/raid0.h b/include/linux/raid/raid0.h deleted file mode 100644 index fd42aa8..0000000 --- a/include/linux/raid/raid0.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _RAID0_H -#define _RAID0_H - -#include - -struct strip_zone -{ - sector_t zone_start; /* Zone offset in md_dev (in sectors) */ - sector_t dev_start; /* Zone offset in real dev (in sectors) */ - sector_t sectors; /* Zone size in sectors */ - int nb_dev; /* # of devices attached to the zone */ - mdk_rdev_t **dev; /* Devices attached to the zone */ -}; - -struct raid0_private_data -{ - struct strip_zone **hash_table; /* Table of indexes into strip_zone */ - struct strip_zone *strip_zone; - mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ - int nr_strip_zones; - - sector_t spacing; - int sector_shift; /* shift this before divide by spacing */ -}; - -typedef struct raid0_private_data raid0_conf_t; - -#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) - -#endif diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h deleted file mode 100644 index 0a9ba7c..0000000 --- a/include/linux/raid/raid1.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef _RAID1_H -#define _RAID1_H - -#include - -typedef struct mirror_info mirror_info_t; - -struct mirror_info { - mdk_rdev_t *rdev; - sector_t head_position; -}; - -/* - * memory pools need a pointer to the mddev, so they can force an unplug - * when memory is tight, and a count of the number of drives that the - * pool was allocated for, so they know how much to allocate and free. - * mddev->raid_disks cannot be used, as it can change while a pool is active - * These two datums are stored in a kmalloced struct. 
- */ - -struct pool_info { - mddev_t *mddev; - int raid_disks; -}; - - -typedef struct r1bio_s r1bio_t; - -struct r1_private_data_s { - mddev_t *mddev; - mirror_info_t *mirrors; - int raid_disks; - int last_used; - sector_t next_seq_sect; - spinlock_t device_lock; - - struct list_head retry_list; - /* queue pending writes and submit them on unplug */ - struct bio_list pending_bio_list; - /* queue of writes that have been unplugged */ - struct bio_list flushing_bio_list; - - /* for use when syncing mirrors: */ - - spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; - sector_t next_resync; - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - - wait_queue_head_t wait_barrier; - - struct pool_info *poolinfo; - - struct page *tmppage; - - mempool_t *r1bio_pool; - mempool_t *r1buf_pool; -}; - -typedef struct r1_private_data_s conf_t; - -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((conf_t *) mddev->private) - -/* - * this is our 'private' RAID1 bio. - * - * it contains information about what kind of IO operations were started - * for this RAID1 operation, and about their status: - */ - -struct r1bio_s { - atomic_t remaining; /* 'have we finished' count, - * used from IRQ handlers - */ - atomic_t behind_remaining; /* number of write-behind ios remaining - * in this BehindIO request - */ - sector_t sector; - int sectors; - unsigned long state; - mddev_t *mddev; - /* - * original bio going to /dev/mdx - */ - struct bio *master_bio; - /* - * if the IO is in READ direction, then this is where we read - */ - int read_disk; - - struct list_head retry_list; - struct bitmap_update *bitmap_update; - /* - * if the IO is in WRITE direction, then multiple bios are used. - * We choose the number when they are allocated. 
- */ - struct bio *bios[0]; - /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ -}; - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio*)1) - -/* bits for r1bio.state */ -#define R1BIO_Uptodate 0 -#define R1BIO_IsSync 1 -#define R1BIO_Degraded 2 -#define R1BIO_BehindIO 3 -#define R1BIO_Barrier 4 -#define R1BIO_BarrierRetry 5 -/* For write-behind requests, we call bi_end_io when - * the last non-write-behind device completes, providing - * any write was successful. Otherwise we call when - * any write-behind write succeeds, otherwise we call - * with failure when last write completes (and all failed). - * Record that bi_end_io was called with this flag... - */ -#define R1BIO_Returned 6 - -#endif diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h deleted file mode 100644 index e9091cf..0000000 --- a/include/linux/raid/raid10.h +++ /dev/null @@ -1,123 +0,0 @@ -#ifndef _RAID10_H -#define _RAID10_H - -#include - -typedef struct mirror_info mirror_info_t; - -struct mirror_info { - mdk_rdev_t *rdev; - sector_t head_position; -}; - -typedef struct r10bio_s r10bio_t; - -struct r10_private_data_s { - mddev_t *mddev; - mirror_info_t *mirrors; - int raid_disks; - spinlock_t device_lock; - - /* geometry */ - int near_copies; /* number of copies layed out raid0 style */ - int far_copies; /* number of copies layed out - * at large strides across drives - */ - int far_offset; /* far_copies are offset by 1 stripe - * instead of many - */ - int copies; /* near_copies * far_copies. - * must be <= raid_disks - */ - sector_t stride; /* distance between far copies. - * This is size / far_copies unless - * far_offset, in which case it is - * 1 stripe. 
- */ - - int chunk_shift; /* shift from chunks to sectors */ - sector_t chunk_mask; - - struct list_head retry_list; - /* queue pending writes and submit them on unplug */ - struct bio_list pending_bio_list; - - - spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; - sector_t next_resync; - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - - wait_queue_head_t wait_barrier; - - mempool_t *r10bio_pool; - mempool_t *r10buf_pool; - struct page *tmppage; -}; - -typedef struct r10_private_data_s conf_t; - -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((conf_t *) mddev->private) - -/* - * this is our 'private' RAID10 bio. - * - * it contains information about what kind of IO operations were started - * for this RAID10 operation, and about their status: - */ - -struct r10bio_s { - atomic_t remaining; /* 'have we finished' count, - * used from IRQ handlers - */ - sector_t sector; /* virtual sector number */ - int sectors; - unsigned long state; - mddev_t *mddev; - /* - * original bio going to /dev/mdx - */ - struct bio *master_bio; - /* - * if the IO is in READ direction, then this is where we read - */ - int read_slot; - - struct list_head retry_list; - /* - * if the IO is in WRITE direction, then multiple bios are used, - * one for each copy. - * When resyncing we also use one for each copy. - * When reconstructing, we use 2 bios, one for read, one for write. - * We choose the number when they are allocated. - */ - struct { - struct bio *bio; - sector_t addr; - int devnum; - } devs[0]; -}; - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. 
To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio*)1) - -/* bits for r10bio.state */ -#define R10BIO_Uptodate 0 -#define R10BIO_IsSync 1 -#define R10BIO_IsRecover 2 -#define R10BIO_Degraded 3 -#endif diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h deleted file mode 100644 index 3b26727..0000000 --- a/include/linux/raid/raid5.h +++ /dev/null @@ -1,402 +0,0 @@ -#ifndef _RAID5_H -#define _RAID5_H - -#include -#include - -/* - * - * Each stripe contains one buffer per disc. Each buffer can be in - * one of a number of states stored in "flags". Changes between - * these states happen *almost* exclusively under a per-stripe - * spinlock. Some very specific changes can happen in bi_end_io, and - * these are not protected by the spin lock. - * - * The flag bits that are used to represent these states are: - * R5_UPTODATE and R5_LOCKED - * - * State Empty == !UPTODATE, !LOCK - * We have no data, and there is no active request - * State Want == !UPTODATE, LOCK - * A read request is being submitted for this block - * State Dirty == UPTODATE, LOCK - * Some new data is in this buffer, and it is being written out - * State Clean == UPTODATE, !LOCK - * We have valid data which is the same as on disc - * - * The possible state transitions are: - * - * Empty -> Want - on read or write to get old data for parity calc - * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) - * Empty -> Clean - on compute_block when computing a block for failed drive - * Want -> Empty - on failed read - * Want -> Clean - on successful completion of read request - * Dirty -> Clean - on successful completion of write request - * Dirty -> Clean - on failed write - * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) - * - * The Want->Empty, Want->Clean, Dirty->Clean, transitions - * all happen in b_end_io at interrupt time. 
- * Each sets the Uptodate bit before releasing the Lock bit. - * This leaves one multi-stage transition: - * Want->Dirty->Clean - * This is safe because thinking that a Clean buffer is actually dirty - * will at worst delay some action, and the stripe will be scheduled - * for attention after the transition is complete. - * - * There is one possibility that is not covered by these states. That - * is if one drive has failed and there is a spare being rebuilt. We - * can't distinguish between a clean block that has been generated - * from parity calculations, and a clean block that has been - * successfully written to the spare ( or to parity when resyncing). - * To distingush these states we have a stripe bit STRIPE_INSYNC that - * is set whenever a write is scheduled to the spare, or to the parity - * disc if there is no spare. A sync request clears this bit, and - * when we find it set with no buffers locked, we know the sync is - * complete. - * - * Buffers for the md device that arrive via make_request are attached - * to the appropriate stripe in one of two lists linked on b_reqnext. - * One list (bh_read) for read requests, one (bh_write) for write. - * There should never be more than one buffer on the two lists - * together, but we are not guaranteed of that so we allow for more. - * - * If a buffer is on the read list when the associated cache buffer is - * Uptodate, the data is copied into the read buffer and it's b_end_io - * routine is called. This may happen in the end_request routine only - * if the buffer has just successfully been read. end_request should - * remove the buffers from the list and then set the Uptodate bit on - * the buffer. Other threads may do this only if they first check - * that the Uptodate bit is set. Once they have checked that they may - * take buffers off the read queue. 
- * - * When a buffer on the write list is committed for write it is copied - * into the cache buffer, which is then marked dirty, and moved onto a - * third list, the written list (bh_written). Once both the parity - * block and the cached buffer are successfully written, any buffer on - * a written list can be returned with b_end_io. - * - * The write list and read list both act as fifos. The read list is - * protected by the device_lock. The write and written lists are - * protected by the stripe lock. The device_lock, which can be - * claimed while the stipe lock is held, is only for list - * manipulations and will only be held for a very short time. It can - * be claimed from interrupts. - * - * - * Stripes in the stripe cache can be on one of two lists (or on - * neither). The "inactive_list" contains stripes which are not - * currently being used for any request. They can freely be reused - * for another stripe. The "handle_list" contains stripes that need - * to be handled in some way. Both of these are fifo queues. Each - * stripe is also (potentially) linked to a hash bucket in the hash - * table so that it can be found by sector number. Stripes that are - * not hashed must be on the inactive_list, and will normally be at - * the front. All stripes start life this way. - * - * The inactive_list, handle_list and hash bucket lists are all protected by the - * device_lock. - * - stripes on the inactive_list never have their stripe_lock held. - * - stripes have a reference counter. If count==0, they are on a list. - * - If a stripe might need handling, STRIPE_HANDLE is set. 
- * - When refcount reaches zero, then if STRIPE_HANDLE it is put on - * handle_list else inactive_list - * - * This, combined with the fact that STRIPE_HANDLE is only ever - * cleared while a stripe has a non-zero count means that if the - * refcount is 0 and STRIPE_HANDLE is set, then it is on the - * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then - * the stripe is on inactive_list. - * - * The possible transitions are: - * activate an unhashed/inactive stripe (get_active_stripe()) - * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev - * activate a hashed, possibly active stripe (get_active_stripe()) - * lockdev check-hash if(!cnt++)unlink-stripe unlockdev - * attach a request to an active stripe (add_stripe_bh()) - * lockdev attach-buffer unlockdev - * handle a stripe (handle_stripe()) - * lockstripe clrSTRIPE_HANDLE ... - * (lockdev check-buffers unlockdev) .. - * change-state .. - * record io/ops needed unlockstripe schedule io/ops - * release an active stripe (release_stripe()) - * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev - * - * The refcount counts each thread that have activated the stripe, - * plus raid5d if it is handling it, plus one for each active request - * on a cached buffer, and plus one if the stripe is undergoing stripe - * operations. - * - * Stripe operations are performed outside the stripe lock, - * the stripe operations are: - * -copying data between the stripe cache and user application buffers - * -computing blocks to save a disk access, or to recover a missing block - * -updating the parity on a write operation (reconstruct write and - * read-modify-write) - * -checking parity correctness - * -running i/o to disk - * These operations are carried out by raid5_run_ops which uses the async_tx - * api to (optionally) offload operations to dedicated hardware engines. 
- * When requesting an operation handle_stripe sets the pending bit for the - * operation and increments the count. raid5_run_ops is then run whenever - * the count is non-zero. - * There are some critical dependencies between the operations that prevent some - * from being requested while another is in flight. - * 1/ Parity check operations destroy the in cache version of the parity block, - * so we prevent parity dependent operations like writes and compute_blocks - * from starting while a check is in progress. Some dma engines can perform - * the check without damaging the parity block, in these cases the parity - * block is re-marked up to date (assuming the check was successful) and is - * not re-read from disk. - * 2/ When a write operation is requested we immediately lock the affected - * blocks, and mark them as not up to date. This causes new read requests - * to be held off, as well as parity checks and compute block operations. - * 3/ Once a compute block operation has been requested handle_stripe treats - * that block as if it is up to date. raid5_run_ops guaruntees that any - * operation that is dependent on the compute block result is initiated after - * the compute block completes. - */ - -/* - * Operations state - intermediate states that are visible outside of sh->lock - * In general _idle indicates nothing is running, _run indicates a data - * processing operation is active, and _result means the data processing result - * is stable and can be acted upon. 
For simple operations like biofill and - * compute that only have an _idle and _run state they are indicated with - * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) - */ -/** - * enum check_states - handles syncing / repairing a stripe - * @check_state_idle - check operations are quiesced - * @check_state_run - check operation is running - * @check_state_result - set outside lock when check result is valid - * @check_state_compute_run - check failed and we are repairing - * @check_state_compute_result - set outside lock when compute result is valid - */ -enum check_states { - check_state_idle = 0, - check_state_run, /* parity check */ - check_state_check_result, - check_state_compute_run, /* parity repair */ - check_state_compute_result, -}; - -/** - * enum reconstruct_states - handles writing or expanding a stripe - */ -enum reconstruct_states { - reconstruct_state_idle = 0, - reconstruct_state_prexor_drain_run, /* prexor-write */ - reconstruct_state_drain_run, /* write */ - reconstruct_state_run, /* expand */ - reconstruct_state_prexor_drain_result, - reconstruct_state_drain_result, - reconstruct_state_result, -}; - -struct stripe_head { - struct hlist_node hash; - struct list_head lru; /* inactive_list or handle_list */ - struct raid5_private_data *raid_conf; - sector_t sector; /* sector of this row */ - int pd_idx; /* parity disk index */ - unsigned long state; /* state flags */ - atomic_t count; /* nr of active thread/requests */ - spinlock_t lock; - int bm_seq; /* sequence number for bitmap flushes */ - int disks; /* disks in stripe */ - enum check_states check_state; - enum reconstruct_states reconstruct_state; - /* stripe_operations - * @target - STRIPE_OP_COMPUTE_BLK target - */ - struct stripe_operations { - int target; - u32 zero_sum_result; - } ops; - struct r5dev { - struct bio req; - struct bio_vec vec; - struct page *page; - struct bio *toread, *read, *towrite, *written; - sector_t sector; /* sector of this page */ - unsigned long flags; 
- } dev[1]; /* allocated with extra space depending of RAID geometry */ -}; - -/* stripe_head_state - collects and tracks the dynamic state of a stripe_head - * for handle_stripe. It is only valid under spin_lock(sh->lock); - */ -struct stripe_head_state { - int syncing, expanding, expanded; - int locked, uptodate, to_read, to_write, failed, written; - int to_fill, compute, req_compute, non_overwrite; - int failed_num; - unsigned long ops_request; -}; - -/* r6_state - extra state data only relevant to r6 */ -struct r6_state { - int p_failed, q_failed, qd_idx, failed_num[2]; -}; - -/* Flags */ -#define R5_UPTODATE 0 /* page contains current data */ -#define R5_LOCKED 1 /* IO has been submitted on "req" */ -#define R5_OVERWRITE 2 /* towrite covers whole page */ -/* and some that are internal to handle_stripe */ -#define R5_Insync 3 /* rdev && rdev->in_sync at start */ -#define R5_Wantread 4 /* want to schedule a read */ -#define R5_Wantwrite 5 -#define R5_Overlap 7 /* There is a pending overlapping request on this block */ -#define R5_ReadError 8 /* seen a read error here recently */ -#define R5_ReWrite 9 /* have tried to over-write the readerror */ - -#define R5_Expanded 10 /* This block now has post-expand data */ -#define R5_Wantcompute 11 /* compute_block in progress treat as - * uptodate - */ -#define R5_Wantfill 12 /* dev->toread contains a bio that needs - * filling - */ -#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ -/* - * Write method - */ -#define RECONSTRUCT_WRITE 1 -#define READ_MODIFY_WRITE 2 -/* not a write method, but a compute_parity mode */ -#define CHECK_PARITY 3 - -/* - * Stripe state - */ -#define STRIPE_HANDLE 2 -#define STRIPE_SYNCING 3 -#define STRIPE_INSYNC 4 -#define STRIPE_PREREAD_ACTIVE 5 -#define STRIPE_DELAYED 6 -#define STRIPE_DEGRADED 7 -#define STRIPE_BIT_DELAY 8 -#define STRIPE_EXPANDING 9 -#define STRIPE_EXPAND_SOURCE 10 -#define STRIPE_EXPAND_READY 11 -#define STRIPE_IO_STARTED 12 /* do not count towards 
'bypass_count' */ -#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ -#define STRIPE_BIOFILL_RUN 14 -#define STRIPE_COMPUTE_RUN 15 -/* - * Operation request flags - */ -#define STRIPE_OP_BIOFILL 0 -#define STRIPE_OP_COMPUTE_BLK 1 -#define STRIPE_OP_PREXOR 2 -#define STRIPE_OP_BIODRAIN 3 -#define STRIPE_OP_POSTXOR 4 -#define STRIPE_OP_CHECK 5 - -/* - * Plugging: - * - * To improve write throughput, we need to delay the handling of some - * stripes until there has been a chance that several write requests - * for the one stripe have all been collected. - * In particular, any write request that would require pre-reading - * is put on a "delayed" queue until there are no stripes currently - * in a pre-read phase. Further, if the "delayed" queue is empty when - * a stripe is put on it then we "plug" the queue and do not process it - * until an unplug call is made. (the unplug_io_fn() is called). - * - * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add - * it to the count of prereading stripes. - * When write is initiated, or the stripe refcnt == 0 (just in case) we - * clear the PREREAD_ACTIVE flag and decrement the count - * Whenever the 'handle' queue is empty and the device is not plugged, we - * move any strips from delayed to handle and clear the DELAYED flag and set - * PREREAD_ACTIVE. - * In stripe_handle, if we find pre-reading is necessary, we do it if - * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. - * HANDLE gets cleared if stripe_handle leave nothing locked. 
- */ - - -struct disk_info { - mdk_rdev_t *rdev; -}; - -struct raid5_private_data { - struct hlist_head *stripe_hashtbl; - mddev_t *mddev; - struct disk_info *spare; - int chunk_size, level, algorithm; - int max_degraded; - int raid_disks; - int max_nr_stripes; - - /* used during an expand */ - sector_t expand_progress; /* MaxSector when no expand happening */ - sector_t expand_lo; /* from here up to expand_progress it out-of-bounds - * as we haven't flushed the metadata yet - */ - int previous_raid_disks; - - struct list_head handle_list; /* stripes needing handling */ - struct list_head hold_list; /* preread ready stripes */ - struct list_head delayed_list; /* stripes that have plugged requests */ - struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ - struct bio *retry_read_aligned; /* currently retrying aligned bios */ - struct bio *retry_read_aligned_list; /* aligned bios retry list */ - atomic_t preread_active_stripes; /* stripes with scheduled io */ - atomic_t active_aligned_reads; - atomic_t pending_full_writes; /* full write backlog */ - int bypass_count; /* bypassed prereads */ - int bypass_threshold; /* preread nice */ - struct list_head *last_hold; /* detect hold_list promotions */ - - atomic_t reshape_stripes; /* stripes with pending writes for reshape */ - /* unfortunately we need two cache names as we temporarily have - * two caches. - */ - int active_name; - char cache_name[2][20]; - struct kmem_cache *slab_cache; /* for allocating stripes */ - - int seq_flush, seq_write; - int quiesce; - - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. 
- */ - - struct page *spare_page; /* Used when checking P/Q in raid6 */ - - /* - * Free stripes pool - */ - atomic_t active_stripes; - struct list_head inactive_list; - wait_queue_head_t wait_for_stripe; - wait_queue_head_t wait_for_overlap; - int inactive_blocked; /* release of inactive stripes blocked, - * waiting for 25% to be free - */ - int pool_size; /* number of disks in stripeheads in pool */ - spinlock_t device_lock; - struct disk_info *disks; -}; - -typedef struct raid5_private_data raid5_conf_t; - -#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) - -/* - * Our supported algorithms - */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 -#define ALGORITHM_RIGHT_ASYMMETRIC 1 -#define ALGORITHM_LEFT_SYMMETRIC 2 -#define ALGORITHM_RIGHT_SYMMETRIC 3 - -#endif -- cgit v0.10.2 From 8b2b5c217c20b5460218ab8731295f2e46c7dd29 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:03 +1100 Subject: md: move LEVEL_* definition from md_k.h to md_u.h .. as they are part of the user-space interface. Also move MdpMinorShift into there so we can remove duplication. Lastly move mdp_major in. It is less obviously part of the user-space interface, but do_mounts_md.c uses it, and it is acting a bit like user-space. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 9a3214c..96336b0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -46,9 +46,6 @@ #include #include "bitmap.h" -/* 63 partitions with the alternate major number (mdp) */ -#define MdpMinorShift 6 - #define DEBUG 0 #define dprintk(x...) 
((void)(DEBUG && printk(x))) diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 82bea14..8bfaf6b 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -52,8 +52,6 @@ */ #define MD_PATCHLEVEL_VERSION 3 -extern int mdp_major; - extern int register_md_personality(struct mdk_personality *p); extern int unregister_md_personality(struct mdk_personality *p); extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4aedb9f..758ec28 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -20,16 +20,6 @@ #ifdef CONFIG_BLOCK -#define LEVEL_MULTIPATH (-4) -#define LEVEL_LINEAR (-1) -#define LEVEL_FAULTY (-5) - -/* we need a value for 'no level specified' and 0 - * means 'raid0', so we need something else. This is - * for internal use only - */ -#define LEVEL_NONE (-1000000) - #define MaxSector (~(sector_t)0) typedef struct mddev_s mddev_t; diff --git a/include/linux/raid/md_u.h b/include/linux/raid/md_u.h index 7192035..2f824aa 100644 --- a/include/linux/raid/md_u.h +++ b/include/linux/raid/md_u.h @@ -46,6 +46,12 @@ #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) +/* 63 partitions with the alternate major number (mdp) */ +#define MdpMinorShift 6 +#ifdef __KERNEL__ +extern int mdp_major; +#endif + typedef struct mdu_version_s { int major; int minor; @@ -85,6 +91,17 @@ typedef struct mdu_array_info_s { } mdu_array_info_t; +/* non-obvious values for 'level' */ +#define LEVEL_MULTIPATH (-4) +#define LEVEL_LINEAR (-1) +#define LEVEL_FAULTY (-5) + +/* we need a value for 'no level specified' and 0 + * means 'raid0', so we need something else. 
This is + * for internal use only + */ +#define LEVEL_NONE (-1000000) + typedef struct mdu_disk_info_s { /* * configuration/status of one particular disk diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 9bdddbc..23a15fb 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -112,8 +112,6 @@ static int __init md_setup(char *str) return 1; } -#define MdpMinorShift 6 - static void __init md_setup_drive(void) { int minor, i, ent, partitioned; -- cgit v0.10.2 From 92022950c6b1bb3da90b2976b20271cdfd98b8a3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move most content from md.h to md_k.h The extern function definitions are kernel-internal definitions, so they belong in md_k.h The MD_*_VERSION values could reasonably go in a number of places, but md_u.h seems most reasonable. This leaves almost nothing in md.h. It will go soon. Signed-off-by: NeilBrown diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 8bfaf6b..71c4fd1 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -34,46 +34,6 @@ #ifdef CONFIG_MD -/* - * Different major versions are not compatible. - * Different minor versions are only downward compatible. - * Different patchlevel versions are downward and upward compatible. - */ -#define MD_MAJOR_VERSION 0 -#define MD_MINOR_VERSION 90 -/* - * MD_PATCHLEVEL_VERSION indicates kernel functionality. 
- * >=1 means different superblock formats are selectable using SET_ARRAY_INFO - * and major_version/minor_version accordingly - * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT - * in the super status byte - * >=3 means that bitmap superblock version 4 is supported, which uses - * little-ending representation rather than host-endian - */ -#define MD_PATCHLEVEL_VERSION 3 - -extern int register_md_personality(struct mdk_personality *p); -extern int unregister_md_personality(struct mdk_personality *p); -extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), - mddev_t *mddev, const char *name); -extern void md_unregister_thread(mdk_thread_t *thread); -extern void md_wakeup_thread(mdk_thread_t *thread); -extern void md_check_recovery(mddev_t *mddev); -extern void md_write_start(mddev_t *mddev, struct bio *bi); -extern void md_write_end(mddev_t *mddev); -extern void md_done_sync(mddev_t *mddev, int blocks, int ok); -extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); - -extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, - sector_t sector, int size, struct page *page); -extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw); -extern void md_do_sync(mddev_t *mddev); -extern void md_new_event(mddev_t *mddev); -extern int md_allow_write(mddev_t *mddev); -extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); - #endif /* CONFIG_MD */ #endif diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 758ec28..4c5e2d0 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -390,3 +390,25 @@ static inline void safe_put_page(struct page *p) #endif /* CONFIG_BLOCK */ #endif + +extern int register_md_personality(struct mdk_personality *p); +extern int unregister_md_personality(struct mdk_personality *p); +extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), + 
mddev_t *mddev, const char *name); +extern void md_unregister_thread(mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_check_recovery(mddev_t *mddev); +extern void md_write_start(mddev_t *mddev, struct bio *bi); +extern void md_write_end(mddev_t *mddev); +extern void md_done_sync(mddev_t *mddev, int blocks, int ok); +extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); + +extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, + sector_t sector, int size, struct page *page); +extern void md_super_wait(mddev_t *mddev); +extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw); +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); +extern int md_allow_write(mddev_t *mddev); +extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); diff --git a/include/linux/raid/md_u.h b/include/linux/raid/md_u.h index 2f824aa..fb1abb3 100644 --- a/include/linux/raid/md_u.h +++ b/include/linux/raid/md_u.h @@ -15,6 +15,24 @@ #ifndef _MD_U_H #define _MD_U_H +/* + * Different major versions are not compatible. + * Different minor versions are only downward compatible. + * Different patchlevel versions are downward and upward compatible. + */ +#define MD_MAJOR_VERSION 0 +#define MD_MINOR_VERSION 90 +/* + * MD_PATCHLEVEL_VERSION indicates kernel functionality. 
+ * >=1 means different superblock formats are selectable using SET_ARRAY_INFO + * and major_version/minor_version accordingly + * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT + * in the super status byte + * >=3 means that bitmap superblock version 4 is supported, which uses + * little-ending representation rather than host-endian + */ +#define MD_PATCHLEVEL_VERSION 3 + /* ioctls */ /* status */ -- cgit v0.10.2 From bff61975b3d6c18ee31457cc5b4d73042f44915f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move lots of #include lines out of .h files and into .c This makes the includes more explicit, and is preparation for moving md_k.h to drivers/md/md.h Remove include/raid/md.h as its only remaining use was to #include other files. Signed-off-by: NeilBrown diff --git a/crypto/xor.c b/crypto/xor.c index b2e6db0..996b6ee 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -18,8 +18,8 @@ #define BH_TRACE 0 #include -#include #include +#include #include /* The xor routines to use. */ diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 7666117..1df012e 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -16,6 +16,7 @@ * wait if count gets too high, wake when it drops to half. */ +#include #include #include #include @@ -26,7 +27,7 @@ #include #include #include -#include +#include #include "bitmap.h" /* debug macros */ diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 86d9adf..cc5d2cf 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -62,7 +62,10 @@ #define ModeShift 5 #define MaxFault 50 -#include +#include +#include +#include +#include static void faulty_fail(struct bio *bio, int error) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 3603ffa..c43c3b6 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -16,6 +16,10 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ +#include +#include +#include +#include #include "linear.h" /* diff --git a/drivers/md/linear.h b/drivers/md/linear.h index f38b9c5..bf81795 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -1,8 +1,6 @@ #ifndef _LINEAR_H #define _LINEAR_H -#include - struct dev_info { mdk_rdev_t *rdev; sector_t num_sectors; diff --git a/drivers/md/md.c b/drivers/md/md.c index 96336b0..11d6e0e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -33,8 +33,9 @@ */ #include -#include +#include #include +#include #include /* for invalidate_bdev */ #include #include @@ -44,6 +45,9 @@ #include #include #include +#include +#include +#include #include "bitmap.h" #define DEBUG 0 diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 547df09..148b3cd 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -19,6 +19,10 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include +#include +#include +#include #include "multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h index 6f53fc1..6fa70b4 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h @@ -1,8 +1,6 @@ #ifndef _MULTIPATH_H #define _MULTIPATH_H -#include - struct multipath_info { mdk_rdev_t *rdev; }; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ef09ed0..64e4c77 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -18,6 +18,9 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ +#include +#include +#include #include "raid0.h" static void raid0_unplug(struct request_queue *q) diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index fd42aa8..824b12e 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -1,8 +1,6 @@ #ifndef _RAID0_H #define _RAID0_H -#include - struct strip_zone { sector_t zone_start; /* Zone offset in md_dev (in sectors) */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index bff3228..253b09c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -31,8 +31,11 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" #include +#include +#include +#include +#include "dm-bio-list.h" #include "raid1.h" #include "bitmap.h" diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 0a9ba7c..1620eea 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -1,8 +1,6 @@ #ifndef _RAID1_H #define _RAID1_H -#include - typedef struct mirror_info mirror_info_t; struct mirror_info { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f03dd70..186e1b1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -18,8 +18,11 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" #include +#include +#include +#include +#include "dm-bio-list.h" #include "raid10.h" #include "bitmap.h" diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index e9091cf..244dbe5 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -1,8 +1,6 @@ #ifndef _RAID10_H #define _RAID10_H -#include - typedef struct mirror_info mirror_info_t; struct mirror_info { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f75698b..816157e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -43,8 +43,12 @@ * miss any bits. 
*/ +#include +#include #include #include +#include +#include "raid5.h" #include "raid6.h" #include "bitmap.h" @@ -1467,7 +1471,7 @@ static void copy_data(int frombio, struct bio *bio, static void compute_parity6(struct stripe_head *sh, int method) { - raid6_conf_t *conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; struct bio *chosen; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ @@ -2795,7 +2799,7 @@ static bool handle_stripe5(struct stripe_head *sh) static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { - raid6_conf_t *conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; struct bio *return_bi = NULL; int i, pd_idx = sh->pd_idx; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 40f1d03..0ed22df 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -1,7 +1,6 @@ #ifndef _RAID5_H #define _RAID5_H -#include #include /* diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h index f6c13af..66e6b0c 100644 --- a/drivers/md/raid6.h +++ b/drivers/md/raid6.h @@ -17,11 +17,7 @@ /* Set to 1 to use kernel-wide empty_zero_page */ #define RAID6_USE_EMPTY_ZERO_PAGE 0 - -#include -#include "raid5.h" - -typedef raid5_conf_t raid6_conf_t; /* Same configuration */ +#include /* Additional compute_parity mode -- updates the parity w/o LOCKING */ #define UPDATE_PARITY 4 diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 45e59d3..141c038 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h deleted file mode 100644 index 71c4fd1..0000000 --- a/include/linux/raid/md.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - md.h : Multiple Devices driver for Linux - Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - Copyright (C) 1994-96 Marc ZYNGIER - or - - - This program is free 
software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_H -#define _MD_H - -#include -#include - -/* - * 'md_p.h' holds the 'physical' layout of RAID devices - * 'md_u.h' holds the user <=> kernel API - * - * 'md_k.h' holds kernel internal definitions - */ - -#include -#include -#include - -#ifdef CONFIG_MD - -#endif /* CONFIG_MD */ -#endif - diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4c5e2d0..e78b3c1d 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -15,9 +15,6 @@ #ifndef _MD_K_H #define _MD_K_H -/* and dm-bio-list.h is not under include/linux because.... ??? */ -#include "../../../drivers/md/dm-bio-list.h" - #ifdef CONFIG_BLOCK #define MaxSector (~(sector_t)0) diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 3e12058..5a21095 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -1,8 +1,6 @@ #ifndef _XOR_H #define _XOR_H -#include - #define MAX_XOR_BLOCKS 4 extern void xor_blocks(unsigned int count, unsigned int bytes, diff --git a/init/do_mounts.h b/init/do_mounts.h index 9aa968d..f5b978a 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 23a15fb..69aebbf 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -1,5 +1,6 @@ #include -#include +#include +#include #include "do_mounts.h" -- cgit v0.10.2 From 43b2e5d86d8bdd77386226db0bc961529492c043 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move md_k.h from include/linux/raid/ 
to drivers/md/ It really is nicer to keep related code together.. Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1df012e..623292a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include "md.h" #include "bitmap.h" /* debug macros */ diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index cc5d2cf..7b66b9f 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -64,7 +64,7 @@ #define MaxFault 50 #include #include -#include +#include "md.h" #include diff --git a/drivers/md/linear.c b/drivers/md/linear.c index c43c3b6..f248834 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -18,8 +18,8 @@ #include #include -#include #include +#include "md.h" #include "linear.h" /* diff --git a/drivers/md/md.c b/drivers/md/md.c index 11d6e0e..aad0ac5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -45,9 +45,9 @@ #include #include #include -#include #include #include +#include "md.h" #include "bitmap.h" #define DEBUG 0 diff --git a/drivers/md/md.h b/drivers/md/md.h new file mode 100644 index 0000000..e78b3c1d --- /dev/null +++ b/drivers/md/md.h @@ -0,0 +1,411 @@ +/* + md_k.h : kernel internal structure of the Linux MD driver + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#ifndef _MD_K_H +#define _MD_K_H + +#ifdef CONFIG_BLOCK + +#define MaxSector (~(sector_t)0) + +typedef struct mddev_s mddev_t; +typedef struct mdk_rdev_s mdk_rdev_t; + +/* + * options passed in raidrun: + */ + +/* Currently this must fit in an 'int' */ +#define MAX_CHUNK_SIZE (1<<30) + +/* + * MD's 'extended' device + */ +struct mdk_rdev_s +{ + struct list_head same_set; /* RAID devices within the same set */ + + sector_t size; /* Device size (in blocks) */ + mddev_t *mddev; /* RAID array if running */ + int last_events; /* IO event timestamp */ + + struct block_device *bdev; /* block device handle */ + + struct page *sb_page; + int sb_loaded; + __u64 sb_events; + sector_t data_offset; /* start of data in array */ + sector_t sb_start; /* offset of the super block (in 512byte sectors) */ + int sb_size; /* bytes in the superblock */ + int preferred_minor; /* autorun support */ + + struct kobject kobj; + + /* A device can be in one of three states based on two flags: + * Not working: faulty==1 in_sync==0 + * Fully working: faulty==0 in_sync==1 + * Working, but not + * in sync with array + * faulty==0 in_sync==0 + * + * It can never have faulty==1, in_sync==1 + * This reduces the burden of testing multiple flags in many cases + */ + + unsigned long flags; +#define Faulty 1 /* device is known to have a fault */ +#define In_sync 2 /* device is in_sync with rest of array */ +#define WriteMostly 4 /* Avoid reading if at all possible */ +#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ +#define AllReserved 6 /* If whole device is reserved for + * one array */ +#define AutoDetected 7 /* added by auto-detect */ +#define Blocked 8 /* An error occured on an externally + * managed array, don't allow writes + * until it is cleared */ +#define StateChanged 9 /* Faulty or Blocked has changed during + * interrupt, so it needs to be + * notified by the thread */ + wait_queue_head_t blocked_wait; + + int desc_nr; /* descriptor index in the superblock */ + int 
raid_disk; /* role of device in array */ + int saved_raid_disk; /* role that device used to have in the + * array and could again if we did a partial + * resync from the bitmap + */ + sector_t recovery_offset;/* If this device has been partially + * recovered, this is where we were + * up to. + */ + + atomic_t nr_pending; /* number of pending requests. + * only maintained for arrays that + * support hot removal + */ + atomic_t read_errors; /* number of consecutive read errors that + * we have tried to ignore. + */ + atomic_t corrected_errors; /* number of corrected read errors, + * for reporting to userspace and storing + * in superblock. + */ + struct work_struct del_work; /* used for delayed sysfs removal */ + + struct sysfs_dirent *sysfs_state; /* handle for 'state' + * sysfs entry */ +}; + +struct mddev_s +{ + void *private; + struct mdk_personality *pers; + dev_t unit; + int md_minor; + struct list_head disks; + unsigned long flags; +#define MD_CHANGE_DEVS 0 /* Some device status has changed */ +#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ +#define MD_CHANGE_PENDING 2 /* superblock update in progress */ + + int ro; + + struct gendisk *gendisk; + + struct kobject kobj; + int hold_active; +#define UNTIL_IOCTL 1 +#define UNTIL_STOP 2 + + /* Superblock information */ + int major_version, + minor_version, + patch_version; + int persistent; + int external; /* metadata is + * managed externally */ + char metadata_type[17]; /* externally set*/ + int chunk_size; + time_t ctime, utime; + int level, layout; + char clevel[16]; + int raid_disks; + int max_disks; + sector_t size; /* used size of component devices */ + sector_t array_sectors; /* exported array size */ + __u64 events; + + char uuid[16]; + + /* If the array is being reshaped, we need to record the + * new shape and an indication of where we are up to. + * This is written to the superblock. + * If reshape_position is MaxSector, then no reshape is happening (yet). 
+ */ + sector_t reshape_position; + int delta_disks, new_level, new_layout, new_chunk; + + struct mdk_thread_s *thread; /* management thread */ + struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ + sector_t curr_resync; /* last block scheduled */ + unsigned long resync_mark; /* a recent timestamp */ + sector_t resync_mark_cnt;/* blocks written at resync_mark */ + sector_t curr_mark_cnt; /* blocks scheduled now */ + + sector_t resync_max_sectors; /* may be set by personality */ + + sector_t resync_mismatches; /* count of sectors where + * parity/replica mismatch found + */ + + /* allow user-space to request suspension of IO to regions of the array */ + sector_t suspend_lo; + sector_t suspend_hi; + /* if zero, use the system-wide default */ + int sync_speed_min; + int sync_speed_max; + + /* resync even though the same disks are shared among md-devices */ + int parallel_resync; + + int ok_start_degraded; + /* recovery/resync flags + * NEEDED: we might need to start a resync/recover + * RUNNING: a thread is running, or about to be started + * SYNC: actually doing a resync, not a recovery + * RECOVER: doing recovery, or need to try it. + * INTR: resync needs to be aborted for some reason + * DONE: thread is done and is waiting to be reaped + * REQUEST: user-space has requested a sync (used with SYNC) + * CHECK: user-space request for for check-only, no repair + * RESHAPE: A reshape is happening + * + * If neither SYNC or RESHAPE are set, then it is a recovery. 
+ */ +#define MD_RECOVERY_RUNNING 0 +#define MD_RECOVERY_SYNC 1 +#define MD_RECOVERY_RECOVER 2 +#define MD_RECOVERY_INTR 3 +#define MD_RECOVERY_DONE 4 +#define MD_RECOVERY_NEEDED 5 +#define MD_RECOVERY_REQUESTED 6 +#define MD_RECOVERY_CHECK 7 +#define MD_RECOVERY_RESHAPE 8 +#define MD_RECOVERY_FROZEN 9 + + unsigned long recovery; + int recovery_disabled; /* if we detect that recovery + * will always fail, set this + * so we don't loop trying */ + + int in_sync; /* know to not need resync */ + struct mutex reconfig_mutex; + atomic_t active; /* general refcount */ + atomic_t openers; /* number of active opens */ + + int changed; /* true if we might need to reread partition info */ + int degraded; /* whether md should consider + * adding a spare + */ + int barriers_work; /* initialised to true, cleared as soon + * as a barrier request to slave + * fails. Only supported + */ + struct bio *biolist; /* bios that need to be retried + * because BIO_RW_BARRIER is not supported + */ + + atomic_t recovery_active; /* blocks scheduled, but not written */ + wait_queue_head_t recovery_wait; + sector_t recovery_cp; + sector_t resync_min; /* user requested sync + * starts here */ + sector_t resync_max; /* resync should pause + * when it gets here */ + + struct sysfs_dirent *sysfs_state; /* handle for 'array_state' + * file in sysfs. + */ + struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ + + struct work_struct del_work; /* used for delayed sysfs removal */ + + spinlock_t write_lock; + wait_queue_head_t sb_wait; /* for waiting on superblock updates */ + atomic_t pending_writes; /* number of active superblock writes */ + + unsigned int safemode; /* if set, update "clean" superblock + * when no writes pending. + */ + unsigned int safemode_delay; + struct timer_list safemode_timer; + atomic_t writes_pending; + struct request_queue *queue; /* for plugging ... 
*/ + + atomic_t write_behind; /* outstanding async IO */ + unsigned int max_write_behind; /* 0 = sync */ + + struct bitmap *bitmap; /* the bitmap for the device */ + struct file *bitmap_file; /* the bitmap file */ + long bitmap_offset; /* offset from superblock of + * start of bitmap. May be + * negative, but not '0' + */ + long default_bitmap_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. + */ + + struct list_head all_mddevs; +}; + + +static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev) +{ + int faulty = test_bit(Faulty, &rdev->flags); + if (atomic_dec_and_test(&rdev->nr_pending) && faulty) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +} + +static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); +} + +struct mdk_personality +{ + char *name; + int level; + struct list_head list; + struct module *owner; + int (*make_request)(struct request_queue *q, struct bio *bio); + int (*run)(mddev_t *mddev); + int (*stop)(mddev_t *mddev); + void (*status)(struct seq_file *seq, mddev_t *mddev); + /* error_handler must set ->faulty and clear ->in_sync + * if appropriate, and should abort recovery if needed + */ + void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev); + int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev); + int (*hot_remove_disk) (mddev_t *mddev, int number); + int (*spare_active) (mddev_t *mddev); + sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); + int (*resize) (mddev_t *mddev, sector_t sectors); + int (*check_reshape) (mddev_t *mddev); + int (*start_reshape) (mddev_t *mddev); + int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); + /* quiesce moves between quiescence states + * 0 - fully active + * 1 - no new requests allowed + * others - reserved + */ + void (*quiesce) (mddev_t *mddev, int state); +}; + + +struct 
md_sysfs_entry { + struct attribute attr; + ssize_t (*show)(mddev_t *, char *); + ssize_t (*store)(mddev_t *, const char *, size_t); +}; + + +static inline char * mdname (mddev_t * mddev) +{ + return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; +} + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. + */ +#define rdev_for_each_list(rdev, tmp, head) \ + list_for_each_entry_safe(rdev, tmp, head, same_set) + +/* + * iterates through the 'same array disks' ringlist + */ +#define rdev_for_each(rdev, tmp, mddev) \ + list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) + +#define rdev_for_each_rcu(rdev, mddev) \ + list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) + +typedef struct mdk_thread_s { + void (*run) (mddev_t *mddev); + mddev_t *mddev; + wait_queue_head_t wqueue; + unsigned long flags; + struct task_struct *tsk; + unsigned long timeout; +} mdk_thread_t; + +#define THREAD_WAKEUP 0 + +#define __wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq, condition, lock, cmd); \ +} while (0) + +static inline void safe_put_page(struct page *p) +{ + if (p) put_page(p); +} + +#endif /* CONFIG_BLOCK */ +#endif + + +extern int register_md_personality(struct mdk_personality *p); +extern int unregister_md_personality(struct mdk_personality *p); +extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), + mddev_t *mddev, const char *name); +extern void md_unregister_thread(mdk_thread_t 
*thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_check_recovery(mddev_t *mddev); +extern void md_write_start(mddev_t *mddev, struct bio *bi); +extern void md_write_end(mddev_t *mddev); +extern void md_done_sync(mddev_t *mddev, int blocks, int ok); +extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); + +extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, + sector_t sector, int size, struct page *page); +extern void md_super_wait(mddev_t *mddev); +extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw); +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); +extern int md_allow_write(mddev_t *mddev); +extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 148b3cd..0ed1005 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -21,8 +21,8 @@ #include #include -#include #include +#include "md.h" #include "multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 64e4c77..3d06df8 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -19,8 +19,8 @@ */ #include -#include #include +#include "md.h" #include "raid0.h" static void raid0_unplug(struct request_queue *q) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 253b09c..051ecfa 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -33,8 +33,8 @@ #include #include -#include #include +#include "md.h" #include "dm-bio-list.h" #include "raid1.h" #include "bitmap.h" diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 186e1b1..fea61e3 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -20,8 +20,8 @@ #include #include -#include #include +#include "md.h" #include "dm-bio-list.h" #include "raid10.h" #include "bitmap.h" diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 816157e..849478e 100644 --- a/drivers/md/raid5.c +++ 
b/drivers/md/raid5.c @@ -44,10 +44,10 @@ */ #include -#include #include #include #include +#include "md.h" #include "raid5.h" #include "raid6.h" #include "bitmap.h" diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h deleted file mode 100644 index e78b3c1d..0000000 --- a/include/linux/raid/md_k.h +++ /dev/null @@ -1,411 +0,0 @@ -/* - md_k.h : kernel internal structure of the Linux MD driver - Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_K_H -#define _MD_K_H - -#ifdef CONFIG_BLOCK - -#define MaxSector (~(sector_t)0) - -typedef struct mddev_s mddev_t; -typedef struct mdk_rdev_s mdk_rdev_t; - -/* - * options passed in raidrun: - */ - -/* Currently this must fit in an 'int' */ -#define MAX_CHUNK_SIZE (1<<30) - -/* - * MD's 'extended' device - */ -struct mdk_rdev_s -{ - struct list_head same_set; /* RAID devices within the same set */ - - sector_t size; /* Device size (in blocks) */ - mddev_t *mddev; /* RAID array if running */ - int last_events; /* IO event timestamp */ - - struct block_device *bdev; /* block device handle */ - - struct page *sb_page; - int sb_loaded; - __u64 sb_events; - sector_t data_offset; /* start of data in array */ - sector_t sb_start; /* offset of the super block (in 512byte sectors) */ - int sb_size; /* bytes in the superblock */ - int preferred_minor; /* autorun support */ - - struct kobject kobj; - - /* A device can be in one of three states based on two flags: - * Not working: faulty==1 in_sync==0 - * Fully working: faulty==0 in_sync==1 - * Working, but not - * in sync with 
array - * faulty==0 in_sync==0 - * - * It can never have faulty==1, in_sync==1 - * This reduces the burden of testing multiple flags in many cases - */ - - unsigned long flags; -#define Faulty 1 /* device is known to have a fault */ -#define In_sync 2 /* device is in_sync with rest of array */ -#define WriteMostly 4 /* Avoid reading if at all possible */ -#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ -#define AllReserved 6 /* If whole device is reserved for - * one array */ -#define AutoDetected 7 /* added by auto-detect */ -#define Blocked 8 /* An error occured on an externally - * managed array, don't allow writes - * until it is cleared */ -#define StateChanged 9 /* Faulty or Blocked has changed during - * interrupt, so it needs to be - * notified by the thread */ - wait_queue_head_t blocked_wait; - - int desc_nr; /* descriptor index in the superblock */ - int raid_disk; /* role of device in array */ - int saved_raid_disk; /* role that device used to have in the - * array and could again if we did a partial - * resync from the bitmap - */ - sector_t recovery_offset;/* If this device has been partially - * recovered, this is where we were - * up to. - */ - - atomic_t nr_pending; /* number of pending requests. - * only maintained for arrays that - * support hot removal - */ - atomic_t read_errors; /* number of consecutive read errors that - * we have tried to ignore. - */ - atomic_t corrected_errors; /* number of corrected read errors, - * for reporting to userspace and storing - * in superblock. 
- */ - struct work_struct del_work; /* used for delayed sysfs removal */ - - struct sysfs_dirent *sysfs_state; /* handle for 'state' - * sysfs entry */ -}; - -struct mddev_s -{ - void *private; - struct mdk_personality *pers; - dev_t unit; - int md_minor; - struct list_head disks; - unsigned long flags; -#define MD_CHANGE_DEVS 0 /* Some device status has changed */ -#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ -#define MD_CHANGE_PENDING 2 /* superblock update in progress */ - - int ro; - - struct gendisk *gendisk; - - struct kobject kobj; - int hold_active; -#define UNTIL_IOCTL 1 -#define UNTIL_STOP 2 - - /* Superblock information */ - int major_version, - minor_version, - patch_version; - int persistent; - int external; /* metadata is - * managed externally */ - char metadata_type[17]; /* externally set*/ - int chunk_size; - time_t ctime, utime; - int level, layout; - char clevel[16]; - int raid_disks; - int max_disks; - sector_t size; /* used size of component devices */ - sector_t array_sectors; /* exported array size */ - __u64 events; - - char uuid[16]; - - /* If the array is being reshaped, we need to record the - * new shape and an indication of where we are up to. - * This is written to the superblock. - * If reshape_position is MaxSector, then no reshape is happening (yet). 
- */ - sector_t reshape_position; - int delta_disks, new_level, new_layout, new_chunk; - - struct mdk_thread_s *thread; /* management thread */ - struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ - sector_t curr_resync; /* last block scheduled */ - unsigned long resync_mark; /* a recent timestamp */ - sector_t resync_mark_cnt;/* blocks written at resync_mark */ - sector_t curr_mark_cnt; /* blocks scheduled now */ - - sector_t resync_max_sectors; /* may be set by personality */ - - sector_t resync_mismatches; /* count of sectors where - * parity/replica mismatch found - */ - - /* allow user-space to request suspension of IO to regions of the array */ - sector_t suspend_lo; - sector_t suspend_hi; - /* if zero, use the system-wide default */ - int sync_speed_min; - int sync_speed_max; - - /* resync even though the same disks are shared among md-devices */ - int parallel_resync; - - int ok_start_degraded; - /* recovery/resync flags - * NEEDED: we might need to start a resync/recover - * RUNNING: a thread is running, or about to be started - * SYNC: actually doing a resync, not a recovery - * RECOVER: doing recovery, or need to try it. - * INTR: resync needs to be aborted for some reason - * DONE: thread is done and is waiting to be reaped - * REQUEST: user-space has requested a sync (used with SYNC) - * CHECK: user-space request for for check-only, no repair - * RESHAPE: A reshape is happening - * - * If neither SYNC or RESHAPE are set, then it is a recovery. 
- */ -#define MD_RECOVERY_RUNNING 0 -#define MD_RECOVERY_SYNC 1 -#define MD_RECOVERY_RECOVER 2 -#define MD_RECOVERY_INTR 3 -#define MD_RECOVERY_DONE 4 -#define MD_RECOVERY_NEEDED 5 -#define MD_RECOVERY_REQUESTED 6 -#define MD_RECOVERY_CHECK 7 -#define MD_RECOVERY_RESHAPE 8 -#define MD_RECOVERY_FROZEN 9 - - unsigned long recovery; - int recovery_disabled; /* if we detect that recovery - * will always fail, set this - * so we don't loop trying */ - - int in_sync; /* know to not need resync */ - struct mutex reconfig_mutex; - atomic_t active; /* general refcount */ - atomic_t openers; /* number of active opens */ - - int changed; /* true if we might need to reread partition info */ - int degraded; /* whether md should consider - * adding a spare - */ - int barriers_work; /* initialised to true, cleared as soon - * as a barrier request to slave - * fails. Only supported - */ - struct bio *biolist; /* bios that need to be retried - * because BIO_RW_BARRIER is not supported - */ - - atomic_t recovery_active; /* blocks scheduled, but not written */ - wait_queue_head_t recovery_wait; - sector_t recovery_cp; - sector_t resync_min; /* user requested sync - * starts here */ - sector_t resync_max; /* resync should pause - * when it gets here */ - - struct sysfs_dirent *sysfs_state; /* handle for 'array_state' - * file in sysfs. - */ - struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ - - struct work_struct del_work; /* used for delayed sysfs removal */ - - spinlock_t write_lock; - wait_queue_head_t sb_wait; /* for waiting on superblock updates */ - atomic_t pending_writes; /* number of active superblock writes */ - - unsigned int safemode; /* if set, update "clean" superblock - * when no writes pending. - */ - unsigned int safemode_delay; - struct timer_list safemode_timer; - atomic_t writes_pending; - struct request_queue *queue; /* for plugging ... 
*/ - - atomic_t write_behind; /* outstanding async IO */ - unsigned int max_write_behind; /* 0 = sync */ - - struct bitmap *bitmap; /* the bitmap for the device */ - struct file *bitmap_file; /* the bitmap file */ - long bitmap_offset; /* offset from superblock of - * start of bitmap. May be - * negative, but not '0' - */ - long default_bitmap_offset; /* this is the offset to use when - * hot-adding a bitmap. It should - * eventually be settable by sysfs. - */ - - struct list_head all_mddevs; -}; - - -static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev) -{ - int faulty = test_bit(Faulty, &rdev->flags); - if (atomic_dec_and_test(&rdev->nr_pending) && faulty) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); -} - -static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) -{ - atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); -} - -struct mdk_personality -{ - char *name; - int level; - struct list_head list; - struct module *owner; - int (*make_request)(struct request_queue *q, struct bio *bio); - int (*run)(mddev_t *mddev); - int (*stop)(mddev_t *mddev); - void (*status)(struct seq_file *seq, mddev_t *mddev); - /* error_handler must set ->faulty and clear ->in_sync - * if appropriate, and should abort recovery if needed - */ - void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev); - int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev); - int (*hot_remove_disk) (mddev_t *mddev, int number); - int (*spare_active) (mddev_t *mddev); - sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); - int (*resize) (mddev_t *mddev, sector_t sectors); - int (*check_reshape) (mddev_t *mddev); - int (*start_reshape) (mddev_t *mddev); - int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); - /* quiesce moves between quiescence states - * 0 - fully active - * 1 - no new requests allowed - * others - reserved - */ - void (*quiesce) (mddev_t *mddev, int state); -}; - - -struct 
md_sysfs_entry { - struct attribute attr; - ssize_t (*show)(mddev_t *, char *); - ssize_t (*store)(mddev_t *, const char *, size_t); -}; - - -static inline char * mdname (mddev_t * mddev) -{ - return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; -} - -/* - * iterates through some rdev ringlist. It's safe to remove the - * current 'rdev'. Dont touch 'tmp' though. - */ -#define rdev_for_each_list(rdev, tmp, head) \ - list_for_each_entry_safe(rdev, tmp, head, same_set) - -/* - * iterates through the 'same array disks' ringlist - */ -#define rdev_for_each(rdev, tmp, mddev) \ - list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) - -#define rdev_for_each_rcu(rdev, mddev) \ - list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) - -typedef struct mdk_thread_s { - void (*run) (mddev_t *mddev); - mddev_t *mddev; - wait_queue_head_t wqueue; - unsigned long flags; - struct task_struct *tsk; - unsigned long timeout; -} mdk_thread_t; - -#define THREAD_WAKEUP 0 - -#define __wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - wait_queue_t __wait; \ - init_waitqueue_entry(&__wait, current); \ - \ - add_wait_queue(&wq, &__wait); \ - for (;;) { \ - set_current_state(TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ - spin_lock_irq(&lock); \ - } \ - current->state = TASK_RUNNING; \ - remove_wait_queue(&wq, &__wait); \ -} while (0) - -#define wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - if (condition) \ - break; \ - __wait_event_lock_irq(wq, condition, lock, cmd); \ -} while (0) - -static inline void safe_put_page(struct page *p) -{ - if (p) put_page(p); -} - -#endif /* CONFIG_BLOCK */ -#endif - - -extern int register_md_personality(struct mdk_personality *p); -extern int unregister_md_personality(struct mdk_personality *p); -extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), - mddev_t *mddev, const char *name); -extern void md_unregister_thread(mdk_thread_t 
*thread); -extern void md_wakeup_thread(mdk_thread_t *thread); -extern void md_check_recovery(mddev_t *mddev); -extern void md_write_start(mddev_t *mddev, struct bio *bi); -extern void md_write_end(mddev_t *mddev); -extern void md_done_sync(mddev_t *mddev, int blocks, int ok); -extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); - -extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, - sector_t sector, int size, struct page *page); -extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw); -extern void md_do_sync(mddev_t *mddev); -extern void md_new_event(mddev_t *mddev); -extern int md_allow_write(mddev_t *mddev); -extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); -- cgit v0.10.2 From 97e4f42d62badb0f9fbc27c013e89bc1336a03bc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: occasionally checkpoint drive recovery to reduce duplicate effort after a crash Version 1.x metadata has the ability to record the status of a partially completed drive recovery. However we only update that record on a clean shutdown. It would be nice to update it on unclean shutdowns too, particularly when using a bitmap that removes much of the 'sync' effort after an unclean shutdown. One complication with checkpointing recovery is that we only know where we are up to in terms of IO requests started, not which ones have completed. And we need to know what has completed to record how much is recovered. So occasionally pause the recovery until all submitted requests are completed, then update the record of where we are up to. When we have a bitmap, we already do that pause occasionally to keep the bitmap up-to-date. So enhance that code to record the recovery offset and schedule a superblock update. And when there is no bitmap, just pause 16 times during the resync to do a checkpoint. '16' is a fairly arbitrary number.
But we don't really have any good way to judge how often is acceptable, and it seems like a reasonable number for now. Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 623292a..5d64da9 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1470,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) wait_event(bitmap->mddev->recovery_wait, atomic_read(&bitmap->mddev->recovery_active) == 0); + bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; + set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { diff --git a/drivers/md/md.c b/drivers/md/md.c index aad0ac5..8ea2088 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1324,10 +1324,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) } if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset > 0) { - sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); - sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); + !test_bit(In_sync, &rdev->flags)) { + if (mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + if (rdev->recovery_offset > 0) { + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); + sb->recovery_offset = + cpu_to_le64(rdev->recovery_offset); + } } if (mddev->reshape_position != MaxSector) { @@ -6072,6 +6077,18 @@ void md_do_sync(mddev_t *mddev) } if (kthread_should_stop()) goto interrupted; + + if (mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) { + /* time to update curr_resync_completed */ + blk_unplug(mddev->queue); + wait_event(mddev->recovery_wait, + atomic_read(&mddev->recovery_active) == 0); + mddev->curr_resync_completed = + mddev->curr_resync; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } 
sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { @@ -6205,6 +6222,8 @@ static int remove_and_add_spares(mddev_t *mddev) mdk_rdev_t *rdev; int spares = 0; + mddev->curr_resync_completed = 0; + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && diff --git a/drivers/md/md.h b/drivers/md/md.h index e78b3c1d..bede26c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -159,6 +159,13 @@ struct mddev_s struct mdk_thread_s *thread; /* management thread */ struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ sector_t curr_resync; /* last block scheduled */ + /* As resync requests can complete out of order, we cannot easily track + * how much resync has been completed. So we occasionally pause until + * everything completes, then set curr_resync_completed to curr_resync. + * As such it may be well behind the real resync mark, but it is a value + * we are certain of. + */ + sector_t curr_resync_completed; unsigned long resync_mark; /* a recent timestamp */ sector_t resync_mark_cnt;/* blocks written at resync_mark */ sector_t curr_mark_cnt; /* blocks scheduled now */ -- cgit v0.10.2 From 575a80fa4f623141e9791e41879d87800fb6d862 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: be more consistent about setting WriteMostly flag when adding a drive to an array When a drive is added to an array using ADD_NEW_DISK, there are two places we can get certain flags from: the metadata on the disk or the flags passed through the IOCTL. For the WriteMostly flag (aka MD_DISK_WRITEMOSTLY) we take the value from either of those sources depending on if it is set (i.e. we effectively 'or' the two sources together). This makes it awkward to clear, and is at best inconsistent. 
As documented code (in mdadm) requires that setting MD_DISK_WRITEMOSTLY in the ioctl will be effective, we resolve the inconsistency by always using the value for this flag from the ioctl, and ignoring the value on disk. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 8ea2088..b2c00ce 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4517,6 +4517,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); + else + clear_bit(WriteMostly, &rdev->flags); rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); -- cgit v0.10.2 From 58c0fed400603a802968b23ddf78f029c5a84e41 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: Make mddev->size sector-based. This patch renames the "size" field of struct mddev_s to "dev_sectors" and stores the number of 512-byte sectors instead of the number of 1K-blocks in it. All users of that field, including raid levels 1,4-6,10, are adjusted accordingly. This simplifies the code a bit because it allows to get rid of a couple of divisions/multiplications by two. In order to make checkpatch happy, some minor coding style issues have also been addressed. In particular, size_store() now uses strict_strtoull() instead of simple_strtoull().
Signed-off-by: Andre Noll Signed-off-by: NeilBrown diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 5d64da9..f8a9f7a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -298,7 +298,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) + size/512 > 0) /* bitmap runs in to metadata */ goto bad_alignment; - if (rdev->data_offset + mddev->size*2 + if (rdev->data_offset + mddev->dev_sectors > rdev->sb_start + bitmap->offset) /* data runs in to bitmap */ goto bad_alignment; diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 7b66b9f..18793c1 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -301,7 +301,7 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) conf->rdev = rdev; - mddev->array_sectors = mddev->size * 2; + mddev->array_sectors = mddev->dev_sectors; mddev->private = conf; reconfig(mddev, mddev->layout, -1); diff --git a/drivers/md/md.c b/drivers/md/md.c index b2c00ce..be4a131 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -816,7 +816,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->clevel[0] = 0; mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; - mddev->size = sb->size; + mddev->dev_sectors = sb->size * 2; mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = MD_SB_BYTES >> 9; @@ -930,7 +930,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->ctime = mddev->ctime; sb->level = mddev->level; - sb->size = mddev->size; + sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; @@ -1028,7 +1028,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) static unsigned long long super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) { - if (num_sectors && num_sectors < rdev->mddev->size * 2) + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ 
if (rdev->mddev->bitmap_offset) return 0; /* can't move bitmap */ @@ -1220,7 +1220,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); - mddev->size = le64_to_cpu(sb->size)/2; + mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = 1024 >> 9; @@ -1316,7 +1316,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); sb->raid_disks = cpu_to_le32(mddev->raid_disks); - sb->size = cpu_to_le64(mddev->size<<1); + sb->size = cpu_to_le64(mddev->dev_sectors); if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); @@ -1374,7 +1374,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) { struct mdp_superblock_1 *sb; sector_t max_sectors; - if (num_sectors && num_sectors < rdev->mddev->size * 2) + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ @@ -1490,8 +1490,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; - /* make sure rdev->size exceeds mddev->size */ - if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { + /* make sure rdev->size exceeds mddev->dev_sectors / 2 */ + if (rdev->size && (mddev->dev_sectors == 0 || + rdev->size < mddev->dev_sectors / 2)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -1500,7 +1501,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (mddev->level > 0) return -ENOSPC; } else - mddev->size = rdev->size; + mddev->dev_sectors = rdev->size * 2; } /* Verify rdev->desc_nr is unique. 
@@ -2243,7 +2244,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) size -= rdev->data_offset/2; } } - if (size < my_mddev->size) + if (size < my_mddev->dev_sectors / 2) return -EINVAL; /* component must fit device */ rdev->size = size; @@ -2809,7 +2810,7 @@ array_state_show(mddev_t *mddev, char *page) else { if (list_empty(&mddev->disks) && mddev->raid_disks == 0 && - mddev->size == 0) + mddev->dev_sectors == 0) st = clear; else st = inactive; @@ -3016,7 +3017,8 @@ __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); static ssize_t size_show(mddev_t *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->size); + return sprintf(page, "%llu\n", + (unsigned long long)mddev->dev_sectors / 2); } static int update_size(mddev_t *mddev, sector_t num_sectors); @@ -3028,20 +3030,19 @@ size_store(mddev_t *mddev, const char *buf, size_t len) * not increase it (except from 0). * If array is active, we can try an on-line resize */ - char *e; - int err = 0; - unsigned long long size = simple_strtoull(buf, &e, 10); - if (!*buf || *buf == '\n' || - (*e && *e != '\n')) - return -EINVAL; + unsigned long long sectors; + int err = strict_strtoull(buf, 10, &sectors); + if (err < 0) + return err; + sectors *= 2; if (mddev->pers) { - err = update_size(mddev, size * 2); + err = update_size(mddev, sectors); md_update_sb(mddev, 1); } else { - if (mddev->size == 0 || - mddev->size > size) - mddev->size = size; + if (mddev->dev_sectors == 0 || + mddev->dev_sectors > sectors) + mddev->dev_sectors = sectors; else err = -ENOSPC; } @@ -3306,15 +3307,15 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t sync_completed_show(mddev_t *mddev, char *page) { - unsigned long max_blocks, resync; + unsigned long max_sectors, resync; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_blocks = mddev->resync_max_sectors; + max_sectors = mddev->resync_max_sectors; else - max_blocks = mddev->size << 1; + max_sectors =
mddev->dev_sectors; resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); - return sprintf(page, "%lu / %lu\n", resync, max_blocks); + return sprintf(page, "%lu / %lu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); @@ -3789,11 +3790,11 @@ static int do_md_run(mddev_t * mddev) /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, - * Internal Bitmap issues has handled elsewhere. + * Internal Bitmap issues have been handled elsewhere. */ if (rdev->data_offset < rdev->sb_start) { - if (mddev->size && - rdev->data_offset + mddev->size*2 + if (mddev->dev_sectors && + rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { printk("md: %s: data overlaps metadata\n", mdname(mddev)); @@ -3875,7 +3876,9 @@ static int do_md_run(mddev_t * mddev) } mddev->recovery = 0; - mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ + /* may be over-ridden by personality */ + mddev->resync_max_sectors = mddev->dev_sectors; + mddev->barriers_work = 1; mddev->ok_start_degraded = start_dirty_degraded; @@ -4131,7 +4134,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) export_array(mddev); mddev->array_sectors = 0; - mddev->size = 0; + mddev->dev_sectors = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; mddev->resync_min = 0; @@ -4337,8 +4340,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.patch_version = MD_PATCHLEVEL_VERSION; info.ctime = mddev->ctime; info.level = mddev->level; - info.size = mddev->size; - if (info.size != mddev->size) /* overflow */ + info.size = mddev->dev_sectors / 2; + if (info.size != mddev->dev_sectors / 2) /* overflow */ info.size = -1; info.nr_disks = nr; info.raid_disks = mddev->raid_disks; @@ -4788,7 +4791,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->level = info->level; mddev->clevel[0] = 0; - mddev->size = info->size; + 
mddev->dev_sectors = 2 * (sector_t)info->size; mddev->raid_disks = info->raid_disks; /* don't set md_minor, it is determined by which /dev/md* was * openned @@ -4926,12 +4929,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) ) return -EINVAL; /* Check there is only one change */ - if (info->size >= 0 && mddev->size != info->size) cnt++; - if (mddev->raid_disks != info->raid_disks) cnt++; - if (mddev->layout != info->layout) cnt++; - if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; - if (cnt == 0) return 0; - if (cnt > 1) return -EINVAL; + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) + cnt++; + if (mddev->raid_disks != info->raid_disks) + cnt++; + if (mddev->layout != info->layout) + cnt++; + if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) + cnt++; + if (cnt == 0) + return 0; + if (cnt > 1) + return -EINVAL; if (mddev->layout != info->layout) { /* Change layout @@ -4943,7 +4952,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) else return mddev->pers->reconfig(mddev, info->layout, -1); } - if (info->size >= 0 && mddev->size != info->size) + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); if (mddev->raid_disks != info->raid_disks) @@ -5443,7 +5452,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) max_blocks = mddev->resync_max_sectors >> 1; else - max_blocks = mddev->size; + max_blocks = mddev->dev_sectors / 2; /* * Should not happen.
@@ -6019,10 +6028,10 @@ void md_do_sync(mddev_t *mddev) j = mddev->recovery_cp; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sectors = mddev->size << 1; + max_sectors = mddev->dev_sectors; else { /* recovery follows the physical size of devices */ - max_sectors = mddev->size << 1; + max_sectors = mddev->dev_sectors; j = MaxSector; list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && diff --git a/drivers/md/md.h b/drivers/md/md.h index bede26c..9461212 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -142,7 +142,8 @@ struct mddev_s char clevel[16]; int raid_disks; int max_disks; - sector_t size; /* used size of component devices */ + sector_t dev_sectors; /* used size of + * component devices */ sector_t array_sectors; /* exported array size */ __u64 events; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 0ed1005..87accf7 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -502,7 +502,7 @@ static int multipath_run (mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_sectors = mddev->size * 2; + mddev->array_sectors = mddev->dev_sectors; mddev->queue->unplug_fn = multipath_unplug; mddev->queue->backing_dev_info.congested_fn = multipath_congested; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 051ecfa..7799587 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1726,7 +1726,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; } - max_sector = mddev->size << 1; + max_sector = mddev->dev_sectors; if (sector_nr >= max_sector) { /* If we aborted, we need to abort the * sync on the 'current' bitmap chunk (there will @@ -2051,7 +2051,7 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_sectors = mddev->size * 2; + mddev->array_sectors = mddev->dev_sectors; mddev->queue->unplug_fn = raid1_unplug; mddev->queue->backing_dev_info.congested_fn = raid1_congested; @@ -2116,12 
+2116,12 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) mddev->array_sectors = sectors; set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - if (mddev->array_sectors / 2 > mddev->size && + if (mddev->array_sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { - mddev->recovery_cp = mddev->size << 1; + mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->size = mddev->array_sectors / 2; + mddev->dev_sectors = mddev->array_sectors; mddev->resync_max_sectors = sectors; return 0; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index fea61e3..d56cb2a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1698,7 +1698,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; skipped: - max_sector = mddev->size << 1; + max_sector = mddev->dev_sectors; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { @@ -2079,7 +2079,7 @@ static int run(mddev_t *mddev) conf->far_offset = fo; conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; conf->chunk_shift = ffz(~mddev->chunk_size) - 9; - size = mddev->size >> (conf->chunk_shift-1); + size = mddev->dev_sectors >> conf->chunk_shift; sector_div(size, fc); size = size * conf->raid_disks; sector_div(size, nc); @@ -2092,7 +2092,7 @@ static int run(mddev_t *mddev) */ stride += conf->raid_disks - 1; sector_div(stride, conf->raid_disks); - mddev->size = stride << (conf->chunk_shift-1); + mddev->dev_sectors = stride << conf->chunk_shift; if (fo) stride = 1; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 849478e..4d71423 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3629,8 +3629,8 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped *(new_data_disks) -1, raid_disks, data_disks, &dd_idx, &pd_idx, conf); - if (last_sector >= (mddev->size<<1)) - last_sector = 
(mddev->size<<1)-1; + if (last_sector >= mddev->dev_sectors) + last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks); @@ -3670,7 +3670,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski struct stripe_head *sh; int pd_idx; int raid_disks = conf->raid_disks; - sector_t max_sector = mddev->size << 1; + sector_t max_sector = mddev->dev_sectors; int sync_blocks; int still_degraded = 0; int i; @@ -3708,7 +3708,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski */ if (mddev->degraded >= conf->max_degraded && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - sector_t rv = (mddev->size << 1) - sector_nr; + sector_t rv = mddev->dev_sectors - sector_nr; *skipped = 1; return rv; } @@ -4146,8 +4146,8 @@ static int run(mddev_t *mddev) conf->expand_progress = mddev->reshape_position; /* device size must be a multiple of chunk size */ - mddev->size &= ~(mddev->chunk_size/1024 -1); - mddev->resync_max_sectors = mddev->size << 1; + mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); + mddev->resync_max_sectors = mddev->dev_sectors; if (conf->level == 6 && conf->raid_disks < 4) { printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", @@ -4254,8 +4254,8 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - - conf->max_degraded); + mddev->array_sectors = mddev->dev_sectors * + (conf->previous_raid_disks - conf->max_degraded); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); @@ -4482,11 +4482,11 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) - conf->max_degraded); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { - 
mddev->recovery_cp = mddev->size << 1; + if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { + mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->size = sectors /2; + mddev->dev_sectors = sectors; mddev->resync_max_sectors = sectors; return 0; } @@ -4615,7 +4615,7 @@ static void end_reshape(raid5_conf_t *conf) struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - conf->mddev->array_sectors = 2 * conf->mddev->size * + conf->mddev->array_sectors = conf->mddev->dev_sectors * (conf->raid_disks - conf->max_degraded); set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); conf->mddev->changed = 1; -- cgit v0.10.2 From dd8ac336c13fd8afdb082ebacb1cddd5cf727889 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: Represent raid device size in sectors. This patch renames the "size" field of struct mdk_rdev_s to "sectors" and changes this field to store sectors instead of blocks. All users of this field, linear.c, raid0.c and md.c, are fixed up accordingly which gets rid of many multiplications and divisions. 
Signed-off-by: Andre Noll Signed-off-by: NeilBrown diff --git a/drivers/md/linear.c b/drivers/md/linear.c index f248834..d5d99290 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -139,8 +139,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) mddev->queue->max_sectors > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - disk->num_sectors = rdev->size * 2; - conf->array_sectors += rdev->size * 2; + disk->num_sectors = rdev->sectors; + conf->array_sectors += rdev->sectors; cnt++; } diff --git a/drivers/md/md.c b/drivers/md/md.c index be4a131..07ab679 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -413,7 +413,7 @@ static void free_disk_sb(mdk_rdev_t * rdev) rdev->sb_loaded = 0; rdev->sb_page = NULL; rdev->sb_start = 0; - rdev->size = 0; + rdev->sectors = 0; } } @@ -779,9 +779,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version else ret = 0; } - rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2; + rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); - if (rdev->size < sb->size && sb->level > 1) + if (rdev->sectors < sb->size * 2 && sb->level > 1) /* "this cannot possibly happen" ... 
*/ ret = -EINVAL; @@ -1184,16 +1184,17 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) ret = 0; } if (minor_version) - rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; + rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - + le64_to_cpu(sb->data_offset); else - rdev->size = rdev->sb_start / 2; - if (rdev->size < le64_to_cpu(sb->data_size)/2) + rdev->sectors = rdev->sb_start; + if (rdev->sectors < le64_to_cpu(sb->data_size)) return -EINVAL; - rdev->size = le64_to_cpu(sb->data_size)/2; + rdev->sectors = le64_to_cpu(sb->data_size); if (le32_to_cpu(sb->chunksize)) - rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); + rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1); - if (le64_to_cpu(sb->size) > rdev->size*2) + if (le64_to_cpu(sb->size) > rdev->sectors) return -EINVAL; return ret; } @@ -1390,7 +1391,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) sector_t sb_start; sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; sb_start &= ~(sector_t)(4*2 - 1); - max_sectors = rdev->size * 2 + sb_start - rdev->sb_start; + max_sectors = rdev->sectors + sb_start - rdev->sb_start; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; @@ -1490,9 +1491,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; - /* make sure rdev->size exceeds mddev->dev_sectors / 2 */ - if (rdev->size && (mddev->dev_sectors == 0 || - rdev->size < mddev->dev_sectors / 2)) { + /* make sure rdev->sectors exceeds mddev->dev_sectors */ + if (rdev->sectors && (mddev->dev_sectors == 0 || + rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -1501,7 +1502,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (mddev->level > 0) return -ENOSPC; } else - mddev->dev_sectors = 
rdev->size * 2; + mddev->dev_sectors = rdev->sectors; } /* Verify rdev->desc_nr is unique. @@ -1757,8 +1758,8 @@ static void print_sb_1(struct mdp_superblock_1 *sb) static void print_rdev(mdk_rdev_t *rdev, int major_version) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", - bdevname(rdev->bdev,b), (unsigned long long)rdev->size, + printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", + bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), rdev->desc_nr); if (rdev->sb_loaded) { @@ -2197,7 +2198,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; - if (rdev->size && rdev->mddev->external) + if (rdev->sectors && rdev->mddev->external) /* Must set offset before size, so overlap checks * can be sane */ return -EBUSY; @@ -2211,7 +2212,7 @@ __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); static ssize_t rdev_size_show(mdk_rdev_t *rdev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)rdev->size); + return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) @@ -2227,31 +2228,31 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) static ssize_t rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) { - unsigned long long size; - unsigned long long oldsize = rdev->size; mddev_t *my_mddev = rdev->mddev; + sector_t oldsectors = rdev->sectors; + unsigned long long sectors; - if (strict_strtoull(buf, 10, &size) < 0) + if (strict_strtoull(buf, 10, &sectors) < 0) return -EINVAL; + sectors *= 2; if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { - size = super_types[my_mddev->major_version]. - rdev_size_change(rdev, size * 2); - if (!size) + sectors = super_types[my_mddev->major_version].
+ rdev_size_change(rdev, sectors); + if (!sectors) return -EBUSY; - } else if (!size) { - size = (rdev->bdev->bd_inode->i_size >> 10); - size -= rdev->data_offset/2; - } + } else if (!sectors) + sectors = (rdev->bdev->bd_inode->i_size >> 9) - + rdev->data_offset; } - if (size < my_mddev->dev_sectors / 2) + if (sectors < my_mddev->dev_sectors) return -EINVAL; /* component must fit device */ - rdev->size = size; - if (size > oldsize && my_mddev->external) { + rdev->sectors = sectors; + if (sectors > oldsectors && my_mddev->external) { /* need to check that all other rdevs with the same ->bdev * do not overlap. We need to unlock the mddev to avoid - * a deadlock. We have already changed rdev->size, and if + * a deadlock. We have already changed rdev->sectors, and if * we have to change it back, we will have the lock again. */ mddev_t *mddev; @@ -2267,9 +2268,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (test_bit(AllReserved, &rdev2->flags) || (rdev->bdev == rdev2->bdev && rdev != rdev2 && - overlaps(rdev->data_offset, rdev->size * 2, + overlaps(rdev->data_offset, rdev->sectors, rdev2->data_offset, - rdev2->size * 2))) { + rdev2->sectors))) { overlap = 1; break; } @@ -2283,11 +2284,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (overlap) { /* Someone else could have slipped in a size * change here, but doing so is just silly. 
- * We put oldsize back because we *know* it is + * We put oldsectors back because we *know* it is * safe, and trust userspace not to race with * itself */ - rdev->size = oldsize; + rdev->sectors = oldsectors; return -EBUSY; } } @@ -3760,13 +3761,13 @@ static int do_md_run(mddev_t * mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { if (test_bit(Faulty, &rdev->flags)) continue; - if (rdev->size < chunk_size / 1024) { + if (rdev->sectors < chunk_size / 512) { printk(KERN_WARNING "md: Dev %s smaller than chunk_size:" - " %lluk < %dk\n", + " %llu < %d\n", bdevname(rdev->bdev,b), - (unsigned long long)rdev->size, - chunk_size / 1024); + (unsigned long long)rdev->sectors, + chunk_size / 512); return -EINVAL; } } @@ -4585,7 +4586,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; } else rdev->sb_start = calc_dev_sboffset(rdev->bdev); - rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; + rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); err = bind_rdev_to_array(rdev, mddev); if (err) { @@ -4655,7 +4656,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) else rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; - rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; + rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING @@ -4856,8 +4857,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) */ return -EBUSY; list_for_each_entry(rdev, &mddev->disks, same_set) { - sector_t avail; - avail = rdev->size * 2; + sector_t avail = rdev->sectors; if (fit && (num_sectors == 0 || num_sectors > avail)) num_sectors = avail; @@ -5585,7 +5585,7 @@ struct mdstat_info { static int md_seq_show(struct seq_file *seq, void *v) { mddev_t *mddev = v; - sector_t size; + sector_t sectors; mdk_rdev_t *rdev; struct mdstat_info *mi = seq->private; struct bitmap *bitmap; @@ -5621,7 +5621,7 @@ static int 
md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " %s", mddev->pers->name); } - size = 0; + sectors = 0; list_for_each_entry(rdev, &mddev->disks, same_set) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", @@ -5633,7 +5633,7 @@ static int md_seq_show(struct seq_file *seq, void *v) continue; } else if (rdev->raid_disk < 0) seq_printf(seq, "(S)"); /* spare */ - size += rdev->size; + sectors += rdev->sectors; } if (!list_empty(&mddev->disks)) { @@ -5643,7 +5643,7 @@ static int md_seq_show(struct seq_file *seq, void *v) mddev->array_sectors / 2); else seq_printf(seq, "\n %llu blocks", - (unsigned long long)size); + (unsigned long long)sectors / 2); } if (mddev->persistent) { if (mddev->major_version != 0 || diff --git a/drivers/md/md.h b/drivers/md/md.h index 9461212..c07ea91 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -36,7 +36,7 @@ struct mdk_rdev_s { struct list_head same_set; /* RAID devices within the same set */ - sector_t size; /* Device size (in blocks) */ + sector_t sectors; /* Device size (in 512bytes sectors) */ mddev_t *mddev; /* RAID array if running */ int last_events; /* IO event timestamp */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 3d06df8..9aebb4c 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -76,16 +76,15 @@ static int create_strip_zones (mddev_t *mddev) list_for_each_entry(rdev2, &mddev->disks, same_set) { printk(KERN_INFO "raid0: comparing %s(%llu)", bdevname(rdev1->bdev,b), - (unsigned long long)rdev1->size); + (unsigned long long)rdev1->sectors); printk(KERN_INFO " with %s(%llu)\n", bdevname(rdev2->bdev,b), - (unsigned long long)rdev2->size); + (unsigned long long)rdev2->sectors); if (rdev2 == rdev1) { printk(KERN_INFO "raid0: END\n"); break; } - if (rdev2->size == rdev1->size) - { + if (rdev2->sectors == rdev1->sectors) { /* * Not unique, don't count it as a new * group @@ -148,7 +147,7 @@ static int create_strip_zones (mddev_t *mddev) mddev->queue->max_sectors > (PAGE_SIZE>>9)) 
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - if (!smallest || (rdev1->size < smallest->size)) + if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; } @@ -158,10 +157,10 @@ static int create_strip_zones (mddev_t *mddev) goto abort; } zone->nb_dev = cnt; - zone->sectors = smallest->size * cnt * 2; + zone->sectors = smallest->sectors * cnt; zone->zone_start = 0; - current_start = smallest->size * 2; + current_start = smallest->sectors; curr_zone_start = zone->sectors; /* now do the other zones */ @@ -180,29 +179,29 @@ static int create_strip_zones (mddev_t *mddev) rdev = conf->strip_zone[0].dev[j]; printk(KERN_INFO "raid0: checking %s ...", bdevname(rdev->bdev, b)); - if (rdev->size > current_start / 2) { - printk(KERN_INFO " contained as device %d\n", - c); - zone->dev[c] = rdev; - c++; - if (!smallest || (rdev->size < smallest->size)) { - smallest = rdev; - printk(KERN_INFO " (%llu) is smallest!.\n", - (unsigned long long)rdev->size); - } - } else + if (rdev->sectors <= current_start) { printk(KERN_INFO " nope.\n"); + continue; + } + printk(KERN_INFO " contained as device %d\n", c); + zone->dev[c] = rdev; + c++; + if (!smallest || rdev->sectors < smallest->sectors) { + smallest = rdev; + printk(KERN_INFO " (%llu) is smallest!.\n", + (unsigned long long)rdev->sectors); + } } zone->nb_dev = c; - zone->sectors = (smallest->size * 2 - current_start) * c; + zone->sectors = (smallest->sectors - current_start) * c; printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", zone->nb_dev, (unsigned long long)zone->sectors); zone->zone_start = curr_zone_start; curr_zone_start += zone->sectors; - current_start = smallest->size * 2; + current_start = smallest->sectors; printk(KERN_INFO "raid0: current zone start: %llu\n", (unsigned long long)current_start); } @@ -296,7 +295,7 @@ static int raid0_run (mddev_t *mddev) /* calculate array device size */ mddev->array_sectors = 0; list_for_each_entry(rdev, &mddev->disks, same_set) - mddev->array_sectors += rdev->size * 
2; + mddev->array_sectors += rdev->sectors; printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", (unsigned long long)mddev->array_sectors); -- cgit v0.10.2 From b5663ba405fe3e51176ddb6c91a5e186590c26b5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:38 +1100 Subject: md/raid5: simplify interface for init_stripe and get_active_stripe Rather than passing 'pd_idx' and 'disks' to these functions, just pass 'previous' which tells whether to use the 'previous' or 'current' geometry during a reshape, and let init_stripe calculate disks and pd_idx and anything else it might need. This is not a substantial simplification and even adds a division. However we will shortly be adding more complexity to init_stripe to handle more interesting 'reshape' activities, and without this change, the interface to these functions would get very complex. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4d71423..c38310b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -274,8 +274,9 @@ static int grow_buffers(struct stripe_head *sh, int num) } static void raid5_build_block(struct stripe_head *sh, int i); +static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks); -static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks) +static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) { raid5_conf_t *conf = sh->raid_conf; int i; @@ -290,11 +291,11 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int remove_hash(sh); + sh->disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; - sh->pd_idx = pd_idx; + sh->pd_idx = stripe_to_pdidx(sector, conf, sh->disks); sh->state = 0; - sh->disks = disks; for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -330,10 +331,12 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in static void unplug_slaves(mddev_t *mddev); static void raid5_unplug_device(struct request_queue *q); -static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, - int pd_idx, int noblock) +static struct stripe_head * +get_active_stripe(raid5_conf_t *conf, sector_t sector, + int previous, int noblock) { struct stripe_head *sh; + int disks = previous ? conf->previous_raid_disks : conf->raid_disks; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); @@ -361,7 +364,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector ); conf->inactive_blocked = 0; } else - init_stripe(sh, sector, pd_idx, disks); + init_stripe(sh, sector, previous); } else { if (atomic_read(&sh->count)) { BUG_ON(!list_empty(&sh->lru)); @@ -2479,8 +2482,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, conf->raid_disks - conf->max_degraded, &dd_idx, &pd_idx, conf); - sh2 = get_active_stripe(conf, s, conf->raid_disks, - pd_idx, 1); + sh2 = get_active_stripe(conf, s, 0, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. 
When later blocks @@ -3413,8 +3415,10 @@ static int make_request(struct request_queue *q, struct bio * bi) for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); int disks, data_disks; + int previous; retry: + previous = 0; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); if (likely(conf->expand_progress == MaxSector)) disks = conf->raid_disks; @@ -3429,9 +3433,10 @@ static int make_request(struct request_queue *q, struct bio * bi) */ spin_lock_irq(&conf->device_lock); disks = conf->raid_disks; - if (logical_sector >= conf->expand_progress) + if (logical_sector >= conf->expand_progress) { disks = conf->previous_raid_disks; - else { + previous = 1; + } else { if (logical_sector >= conf->expand_lo) { spin_unlock_irq(&conf->device_lock); schedule(); @@ -3448,7 +3453,8 @@ static int make_request(struct request_queue *q, struct bio * bi) (unsigned long long)new_sector, (unsigned long long)logical_sector); - sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); + sh = get_active_stripe(conf, new_sector, previous, + (bi->bi_rw&RWA_MASK)); if (sh) { if (unlikely(conf->expand_progress != MaxSector)) { /* expansion might have moved on while waiting for a @@ -3582,9 +3588,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { int j; int skipped = 0; - pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); - sh = get_active_stripe(conf, sector_nr+i, - conf->raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, sector_nr+i, 0, 0); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -3632,10 +3636,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - pd_idx = 
stripe_to_pdidx(first_sector, conf, - conf->previous_raid_disks); - sh = get_active_stripe(conf, first_sector, - conf->previous_raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, first_sector, 1, 0); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -3725,9 +3726,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski bitmap_cond_end_sync(mddev->bitmap, sector_nr); pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); - sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); + sh = get_active_stripe(conf, sector_nr, 0, 1); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, sector_nr, 0, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -3793,7 +3794,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); + sh = get_active_stripe(conf, sector, 0, 1); if (!sh) { /* failed to get a stripe - must wait */ -- cgit v0.10.2 From 112bf8970dbdfc00bd4667da5996e57c2ce58066 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:38 +1100 Subject: md/raid5: change raid5_compute_sector and stripe_to_pdidx to take a 'previous' argument This is similar to the recent change to get_active_stripe. There is no functional change, just some rearrangement to make future patches cleaner. 
Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c38310b..c33073f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -274,7 +274,7 @@ static int grow_buffers(struct stripe_head *sh, int num) } static void raid5_build_block(struct stripe_head *sh, int i); -static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks); +static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous); static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) { @@ -293,7 +293,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; - sh->pd_idx = stripe_to_pdidx(sector, conf, sh->disks); + sh->pd_idx = stripe_to_pdidx(sector, conf, previous); sh->state = 0; @@ -1233,15 +1233,18 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * Input: a 'big' sector number, * Output: index of the data and parity disk, and the sector # in them. */ -static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, - unsigned int data_disks, unsigned int * dd_idx, - unsigned int * pd_idx, raid5_conf_t *conf) +static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, + int previous, + int *dd_idx, int *pd_idx) { long stripe; unsigned long chunk_number; unsigned int chunk_offset; sector_t new_sector; int sectors_per_chunk = conf->chunk_size >> 9; + int raid_disks = previous ? 
conf->previous_raid_disks + : conf->raid_disks; + int data_disks = raid_disks - conf->max_degraded; /* First compute the information on this sector */ @@ -1406,7 +1409,9 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) chunk_number = stripe * data_disks + i; r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; - check = raid5_compute_sector(r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); + check = raid5_compute_sector(conf, r_sector, + (raid_disks != conf->raid_disks), + &dummy1, &dummy2); if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { printk(KERN_ERR "compute_blocknr: map not correct\n"); return 0; @@ -1806,16 +1811,18 @@ static int page_is_zero(struct page *p) memcmp(a, a+4, STRIPE_SIZE-4)==0); } -static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) +static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous) { int sectors_per_chunk = conf->chunk_size >> 9; int pd_idx, dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); + int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; - raid5_compute_sector(stripe * (disks - conf->max_degraded) + raid5_compute_sector(conf, + stripe * (disks - conf->max_degraded) *sectors_per_chunk + chunk_offset, - disks, disks - conf->max_degraded, - &dd_idx, &pd_idx, conf); + previous, + &dd_idx, &pd_idx); return pd_idx; } @@ -2478,10 +2485,8 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head *sh2; sector_t bn = compute_blocknr(sh, i); - sector_t s = raid5_compute_sector(bn, conf->raid_disks, - conf->raid_disks - - conf->max_degraded, &dd_idx, - &pd_idx, conf); + sector_t s = raid5_compute_sector(conf, bn, 0, + &dd_idx, &pd_idx); sh2 = get_active_stripe(conf, s, 0, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe @@ -2768,8 +2773,7 @@ static bool handle_stripe5(struct stripe_head *sh) !sh->reconstruct_state) { /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, - conf->raid_disks); + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0); schedule_reconstruction5(sh, &s, 1, 1); } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); @@ -2987,8 +2991,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { /* Need to write out all blocks after computing P&Q */ sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, - conf->raid_disks); + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0); compute_parity6(sh, RECONSTRUCT_WRITE); for (i = conf->raid_disks ; i-- ; ) { set_bit(R5_LOCKED, &sh->dev[i].flags); @@ -3260,8 +3263,6 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); - const unsigned int raid_disks = conf->raid_disks; - const unsigned int data_disks = 
raid_disks - conf->max_degraded; unsigned int dd_idx, pd_idx; struct bio* align_bi; mdk_rdev_t *rdev; @@ -3285,12 +3286,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) /* * compute position */ - align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector, - raid_disks, - data_disks, - &dd_idx, - &pd_idx, - conf); + align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, + 0, + &dd_idx, &pd_idx); rcu_read_lock(); rdev = rcu_dereference(conf->disks[dd_idx].rdev); @@ -3447,8 +3445,9 @@ static int make_request(struct request_queue *q, struct bio * bi) } data_disks = disks - conf->max_degraded; - new_sector = raid5_compute_sector(logical_sector, disks, data_disks, - &dd_idx, &pd_idx, conf); + new_sector = raid5_compute_sector(conf, logical_sector, + previous, + &dd_idx, &pd_idx); pr_debug("raid5: make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, (unsigned long long)logical_sector); @@ -3625,14 +3624,12 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * block on the destination stripes. 
*/ first_sector = - raid5_compute_sector(sector_nr*(new_data_disks), - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); + raid5_compute_sector(conf, sector_nr*(new_data_disks), + 1, &dd_idx, &pd_idx); last_sector = - raid5_compute_sector((sector_nr+conf->chunk_size/512) - *(new_data_disks) -1, - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); + raid5_compute_sector(conf, ((sector_nr+conf->chunk_size/512) + *(new_data_disks) - 1), + 1, &dd_idx, &pd_idx); if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { @@ -3669,8 +3666,6 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski { raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; - int pd_idx; - int raid_disks = conf->raid_disks; sector_t max_sector = mddev->dev_sectors; int sync_blocks; int still_degraded = 0; @@ -3725,7 +3720,6 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski bitmap_cond_end_sync(mddev->bitmap, sector_nr); - pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); sh = get_active_stripe(conf, sector_nr, 0, 1); if (sh == NULL) { sh = get_active_stripe(conf, sector_nr, 0, 0); @@ -3777,12 +3771,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) int handled = 0; logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - sector = raid5_compute_sector( logical_sector, - conf->raid_disks, - conf->raid_disks - conf->max_degraded, - &dd_idx, - &pd_idx, - conf); + sector = raid5_compute_sector(conf, logical_sector, + 0, &dd_idx, &pd_idx); last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); for (; logical_sector < last_sector; -- cgit v0.10.2 From d0dabf7e577411c2bf6b616c751544dc241213d4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:38 +1100 Subject: md/raid6: remove expectation that Q device is immediately after P device. 
Code currently assumes that the devices in a raid6 stripe are 0 1 ... N-1 P Q in some rotated order. We will shortly add new layouts in which this strict pattern is broken. So remove this expectation. We still assume that the data disks are roughly in-order. However P and Q can be inserted anywhere within that order. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c33073f..cb3e157 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -133,12 +133,36 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16); } +/* Find first data disk in a raid6 stripe */ +static inline int raid6_d0(struct stripe_head *sh) +{ + if (sh->qd_idx == sh->disks - 1) + return 0; + else + return sh->qd_idx + 1; +} static inline int raid6_next_disk(int disk, int raid_disks) { disk++; return (disk < raid_disks) ? disk : 0; } +/* When walking through the disks in a raid6, starting at raid6_d0, + * We need to map each disk to a 'slot', where the data disks are slot + * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk + * is raid_disks-1. This helper does that mapping. 
+ */ +static int raid6_idx_to_slot(int idx, struct stripe_head *sh, int *count) +{ + int slot; + if (idx == sh->pd_idx) + return sh->disks - 2; + if (idx == sh->qd_idx) + return sh->disks - 1; + slot = (*count)++; + return slot; +} + static void return_io(struct bio *return_bi) { struct bio *bi = return_bi; @@ -196,6 +220,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) } } } + static void release_stripe(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; @@ -274,12 +299,14 @@ static int grow_buffers(struct stripe_head *sh, int num) } static void raid5_build_block(struct stripe_head *sh, int i); -static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous); +static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous, + int *qd_idx); static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) { raid5_conf_t *conf = sh->raid_conf; int i; + int qd_idx; BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); @@ -293,7 +320,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; - sh->pd_idx = stripe_to_pdidx(sector, conf, previous); + sh->pd_idx = stripe_to_pdidx(sector, conf, previous, &qd_idx); + sh->qd_idx = qd_idx; sh->state = 0; @@ -1235,7 +1263,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) */ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, int previous, - int *dd_idx, int *pd_idx) + int *dd_idx, int *pd_idx, int *qd_idx) { long stripe; unsigned long chunk_number; @@ -1268,6 +1296,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, /* * Select the parity disk based on the user selected algorithm. 
*/ + *qd_idx = ~0; switch(conf->level) { case 4: *pd_idx = data_disks; @@ -1303,24 +1332,30 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: *pd_idx = raid_disks - 1 - (stripe % raid_disks); - if (*pd_idx == raid_disks-1) + *qd_idx = *pd_idx + 1; + if (*pd_idx == raid_disks-1) { (*dd_idx)++; /* Q D D D P */ - else if (*dd_idx >= *pd_idx) + *qd_idx = 0; + } else if (*dd_idx >= *pd_idx) (*dd_idx) += 2; /* D D P Q D */ break; case ALGORITHM_RIGHT_ASYMMETRIC: *pd_idx = stripe % raid_disks; - if (*pd_idx == raid_disks-1) + *qd_idx = *pd_idx + 1; + if (*pd_idx == raid_disks-1) { (*dd_idx)++; /* Q D D D P */ - else if (*dd_idx >= *pd_idx) + *qd_idx = 0; + } else if (*dd_idx >= *pd_idx) (*dd_idx) += 2; /* D D P Q D */ break; case ALGORITHM_LEFT_SYMMETRIC: *pd_idx = raid_disks - 1 - (stripe % raid_disks); + *qd_idx = (*pd_idx + 1) % raid_disks; *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; break; case ALGORITHM_RIGHT_SYMMETRIC: *pd_idx = stripe % raid_disks; + *qd_idx = (*pd_idx + 1) % raid_disks; *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; break; default: @@ -1347,7 +1382,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) int sectors_per_chunk = conf->chunk_size >> 9; sector_t stripe; int chunk_offset; - int chunk_number, dummy1, dummy2, dd_idx = i; + int chunk_number, dummy1, dummy2, dummy3, dd_idx = i; sector_t r_sector; @@ -1378,7 +1413,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) } break; case 6: - if (i == raid6_next_disk(sh->pd_idx, raid_disks)) + if (i == sh->qd_idx) return 0; /* It is the Q disk */ switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: @@ -1411,7 +1446,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) check = raid5_compute_sector(conf, r_sector, (raid_disks != conf->raid_disks), - &dummy1, &dummy2); + &dummy1, &dummy2, &dummy3); if (check != sh->sector || dummy1 != dd_idx || dummy2 != 
sh->pd_idx) { printk(KERN_ERR "compute_blocknr: map not correct\n"); return 0; @@ -1480,13 +1515,14 @@ static void copy_data(int frombio, struct bio *bio, static void compute_parity6(struct stripe_head *sh, int method) { raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; + int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; struct bio *chosen; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ void *ptrs[disks]; - qd_idx = raid6_next_disk(pd_idx, disks); - d0_idx = raid6_next_disk(qd_idx, disks); + pd_idx = sh->pd_idx; + qd_idx = sh->qd_idx; + d0_idx = raid6_d0(sh); pr_debug("compute_parity, stripe %llu, method %d\n", (unsigned long long)sh->sector, method); @@ -1524,22 +1560,22 @@ static void compute_parity6(struct stripe_head *sh, int method) set_bit(R5_UPTODATE, &sh->dev[i].flags); } -// switch(method) { -// case RECONSTRUCT_WRITE: -// case CHECK_PARITY: -// case UPDATE_PARITY: - /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ - /* FIX: Is this ordering of drives even remotely optimal? */ - count = 0; - i = d0_idx; - do { - ptrs[count++] = page_address(sh->dev[i].page); - if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) - printk("block %d/%d not uptodate on parity calc\n", i,count); - i = raid6_next_disk(i, disks); - } while ( i != d0_idx ); -// break; -// } + /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ + /* FIX: Is this ordering of drives even remotely optimal? 
*/ + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count); + ptrs[slot] = page_address(sh->dev[i].page); + if (slot < sh->disks - 2 && + !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { + printk(KERN_ERR "block %d/%d not uptodate " + "on parity calc\n", i, count); + BUG(); + } + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count+2 != disks); raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); @@ -1563,8 +1599,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) { int i, count, disks = sh->disks; void *ptr[MAX_XOR_BLOCKS], *dest, *p; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); + int qd_idx = sh->qd_idx; pr_debug("compute_block_1, stripe %llu, idx %d\n", (unsigned long long)sh->sector, dd_idx); @@ -1600,21 +1635,31 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) { int i, count, disks = sh->disks; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); - int d0_idx = raid6_next_disk(qd_idx, disks); - int faila, failb; + int d0_idx = raid6_d0(sh); + int faila = -1, failb = -1; + /**** FIX THIS: This could be very bad if disks is close to 256 ****/ + void *ptrs[disks]; - /* faila and failb are disk numbers relative to d0_idx */ - /* pd_idx become disks-2 and qd_idx become disks-1 */ - faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; - failb = (dd_idx2 < d0_idx) ? 
dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; + count = 0; + i = d0_idx; + do { + int slot; + slot = raid6_idx_to_slot(i, sh, &count); + ptrs[slot] = page_address(sh->dev[i].page); + if (i == dd_idx1) + faila = slot; + if (i == dd_idx2) + failb = slot; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count+2 != disks); BUG_ON(faila == failb); if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", - (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); + (unsigned long long)sh->sector, dd_idx1, dd_idx2, + faila, failb); if ( failb == disks-1 ) { /* Q disk is one of the missing disks */ @@ -1624,39 +1669,26 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) return; } else { /* We're missing D+Q; recompute D from P */ - compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); + compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? + dd_idx2 : dd_idx1), + 0); compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ return; } } - /* We're missing D+P or D+D; build pointer table */ - { - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; - - count = 0; - i = d0_idx; - do { - ptrs[count++] = page_address(sh->dev[i].page); - i = raid6_next_disk(i, disks); - if (i != dd_idx1 && i != dd_idx2 && - !test_bit(R5_UPTODATE, &sh->dev[i].flags)) - printk("compute_2 with missing block %d/%d\n", count, i); - } while ( i != d0_idx ); - - if ( failb == disks-2 ) { - /* We're missing D+P. */ - raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); - } else { - /* We're missing D+D. */ - raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); - } - - /* Both the above update both missing blocks */ - set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); - set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); + /* We're missing D+P or D+D; */ + if (failb == disks-2) { + /* We're missing D+P. 
*/ + raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); + } else { + /* We're missing D+D. */ + raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); } + + /* Both the above update both missing blocks */ + set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); + set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); } static void @@ -1811,7 +1843,8 @@ static int page_is_zero(struct page *p) memcmp(a, a+4, STRIPE_SIZE-4)==0); } -static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous) +static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous, + int *qd_idxp) { int sectors_per_chunk = conf->chunk_size >> 9; int pd_idx, dd_idx; @@ -1822,7 +1855,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous) stripe * (disks - conf->max_degraded) *sectors_per_chunk + chunk_offset, previous, - &dd_idx, &pd_idx); + &dd_idx, &pd_idx, qd_idxp); return pd_idx; } @@ -2481,12 +2514,13 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i = 0; i < sh->disks; i++) if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { - int dd_idx, pd_idx, j; + int dd_idx, pd_idx, qd_idx, j; struct stripe_head *sh2; sector_t bn = compute_blocknr(sh, i); - sector_t s = raid5_compute_sector(conf, bn, 0, - &dd_idx, &pd_idx); + sector_t s = + raid5_compute_sector(conf, bn, 0, + &dd_idx, &pd_idx, &qd_idx); sh2 = get_active_stripe(conf, s, 0, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe @@ -2510,8 +2544,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); for (j = 0; j < conf->raid_disks; j++) if (j != sh2->pd_idx && - (!r6s || j != raid6_next_disk(sh2->pd_idx, - sh2->disks)) && + (!r6s || j != sh2->qd_idx) && !test_bit(R5_Expanded, &sh2->dev[j].flags)) break; if (j == conf->raid_disks) { @@ -2771,9 +2804,11 @@ static bool handle_stripe5(struct stripe_head *sh) if 
(s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && !sh->reconstruct_state) { + int qd_idx; /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0); + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0, &qd_idx); + sh->qd_idx = qd_idx; schedule_reconstruction5(sh, &s, 1, 1); } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); @@ -2814,7 +2849,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) struct r5dev *dev, *pdev, *qdev; mdk_rdev_t *blocked_rdev = NULL; - r6s.qd_idx = raid6_next_disk(pd_idx, disks); + r6s.qd_idx = sh->qd_idx; pr_debug("handling stripe %llu, state=%#lx cnt=%d, " "pd_idx=%d, qd_idx=%d\n", (unsigned long long)sh->sector, sh->state, @@ -2990,8 +3025,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { /* Need to write out all blocks after computing P&Q */ + int qd_idx; sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0); + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0, &qd_idx); + sh->qd_idx = qd_idx; compute_parity6(sh, RECONSTRUCT_WRITE); for (i = conf->raid_disks ; i-- ; ) { set_bit(R5_LOCKED, &sh->dev[i].flags); @@ -3263,7 +3300,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); - unsigned int dd_idx, pd_idx; + unsigned int dd_idx, pd_idx, qd_idx; struct bio* align_bi; mdk_rdev_t *rdev; @@ -3288,7 +3325,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) */ align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 0, - &dd_idx, &pd_idx); + &dd_idx, &pd_idx, &qd_idx); rcu_read_lock(); rdev = rcu_dereference(conf->disks[dd_idx].rdev); @@ -3380,7 +3417,7 @@ static int make_request(struct request_queue *q, 
struct bio * bi) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); - unsigned int dd_idx, pd_idx; + int dd_idx, pd_idx, qd_idx; sector_t new_sector; sector_t logical_sector, last_sector; struct stripe_head *sh; @@ -3447,7 +3484,7 @@ static int make_request(struct request_queue *q, struct bio * bi) new_sector = raid5_compute_sector(conf, logical_sector, previous, - &dd_idx, &pd_idx); + &dd_idx, &pd_idx, &qd_idx); pr_debug("raid5: make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, (unsigned long long)logical_sector); @@ -3535,7 +3572,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped */ raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; - int pd_idx; + int pd_idx, qd_idx; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -3598,7 +3635,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped if (j == sh->pd_idx) continue; if (conf->level == 6 && - j == raid6_next_disk(sh->pd_idx, sh->disks)) + j == sh->qd_idx) continue; s = compute_blocknr(sh, j); if (s < mddev->array_sectors) { @@ -3625,11 +3662,11 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped */ first_sector = raid5_compute_sector(conf, sector_nr*(new_data_disks), - 1, &dd_idx, &pd_idx); + 1, &dd_idx, &pd_idx, &qd_idx); last_sector = raid5_compute_sector(conf, ((sector_nr+conf->chunk_size/512) *(new_data_disks) - 1), - 1, &dd_idx, &pd_idx); + 1, &dd_idx, &pd_idx, &qd_idx); if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { @@ -3764,7 +3801,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
*/ struct stripe_head *sh; - int dd_idx, pd_idx; + int dd_idx, pd_idx, qd_idx; sector_t sector, logical_sector, last_sector; int scnt = 0; int remaining; @@ -3772,7 +3809,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); sector = raid5_compute_sector(conf, logical_sector, - 0, &dd_idx, &pd_idx); + 0, &dd_idx, &pd_idx, &qd_idx); last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); for (; logical_sector < last_sector; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 0ed22df..0c7375a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -196,15 +196,16 @@ enum reconstruct_states { struct stripe_head { struct hlist_node hash; - struct list_head lru; /* inactive_list or handle_list */ - struct raid5_private_data *raid_conf; - sector_t sector; /* sector of this row */ - int pd_idx; /* parity disk index */ - unsigned long state; /* state flags */ - atomic_t count; /* nr of active thread/requests */ + struct list_head lru; /* inactive_list or handle_list */ + struct raid5_private_data *raid_conf; + sector_t sector; /* sector of this row */ + short pd_idx; /* parity disk index */ + short qd_idx; /* 'Q' disk index for raid6 */ + unsigned long state; /* state flags */ + atomic_t count; /* nr of active thread/requests */ spinlock_t lock; int bm_seq; /* sequence number for bitmap flushes */ - int disks; /* disks in stripe */ + int disks; /* disks in stripe */ enum check_states check_state; enum reconstruct_states reconstruct_state; /* stripe_operations -- cgit v0.10.2 From 911d4ee8536d89ea8a6cd3e96b1c95a3ebc5ea66 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:38 +1100 Subject: md/raid5: simplify raid5_compute_sector interface Rather than passing 'pd_idx' and 'qd_idx' to be filled in, pass a 'struct stripe_head *' and fill in the relevant fields. This is more extensible. 
Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cb3e157..2e2e64f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -299,14 +299,13 @@ static int grow_buffers(struct stripe_head *sh, int num) } static void raid5_build_block(struct stripe_head *sh, int i); -static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous, - int *qd_idx); +static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, + struct stripe_head *sh); static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) { raid5_conf_t *conf = sh->raid_conf; int i; - int qd_idx; BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); @@ -320,8 +319,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; - sh->pd_idx = stripe_to_pdidx(sector, conf, previous, &qd_idx); - sh->qd_idx = qd_idx; + stripe_set_idx(sector, conf, previous, sh); sh->state = 0; @@ -1262,12 +1260,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * Output: index of the data and parity disk, and the sector # in them. */ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, - int previous, - int *dd_idx, int *pd_idx, int *qd_idx) + int previous, int *dd_idx, + struct stripe_head *sh) { long stripe; unsigned long chunk_number; unsigned int chunk_offset; + int pd_idx, qd_idx; sector_t new_sector; int sectors_per_chunk = conf->chunk_size >> 9; int raid_disks = previous ? conf->previous_raid_disks @@ -1296,30 +1295,30 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, /* * Select the parity disk based on the user selected algorithm. 
*/ - *qd_idx = ~0; + pd_idx = qd_idx = ~0; switch(conf->level) { case 4: - *pd_idx = data_disks; + pd_idx = data_disks; break; case 5: switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: - *pd_idx = data_disks - stripe % raid_disks; - if (*dd_idx >= *pd_idx) + pd_idx = data_disks - stripe % raid_disks; + if (*dd_idx >= pd_idx) (*dd_idx)++; break; case ALGORITHM_RIGHT_ASYMMETRIC: - *pd_idx = stripe % raid_disks; - if (*dd_idx >= *pd_idx) + pd_idx = stripe % raid_disks; + if (*dd_idx >= pd_idx) (*dd_idx)++; break; case ALGORITHM_LEFT_SYMMETRIC: - *pd_idx = data_disks - stripe % raid_disks; - *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + pd_idx = data_disks - stripe % raid_disks; + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; break; case ALGORITHM_RIGHT_SYMMETRIC: - *pd_idx = stripe % raid_disks; - *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + pd_idx = stripe % raid_disks; + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", @@ -1331,32 +1330,32 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, /**** FIX THIS ****/ switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: - *pd_idx = raid_disks - 1 - (stripe % raid_disks); - *qd_idx = *pd_idx + 1; - if (*pd_idx == raid_disks-1) { + pd_idx = raid_disks - 1 - (stripe % raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { (*dd_idx)++; /* Q D D D P */ - *qd_idx = 0; - } else if (*dd_idx >= *pd_idx) + qd_idx = 0; + } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ break; case ALGORITHM_RIGHT_ASYMMETRIC: - *pd_idx = stripe % raid_disks; - *qd_idx = *pd_idx + 1; - if (*pd_idx == raid_disks-1) { + pd_idx = stripe % raid_disks; + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { (*dd_idx)++; /* Q D D D P */ - *qd_idx = 0; - } else if (*dd_idx >= *pd_idx) + qd_idx = 0; + } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ break; case ALGORITHM_LEFT_SYMMETRIC: - *pd_idx = 
raid_disks - 1 - (stripe % raid_disks); - *qd_idx = (*pd_idx + 1) % raid_disks; - *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; + pd_idx = raid_disks - 1 - (stripe % raid_disks); + qd_idx = (pd_idx + 1) % raid_disks; + *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; break; case ALGORITHM_RIGHT_SYMMETRIC: - *pd_idx = stripe % raid_disks; - *qd_idx = (*pd_idx + 1) % raid_disks; - *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; + pd_idx = stripe % raid_disks; + qd_idx = (pd_idx + 1) % raid_disks; + *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; break; default: printk(KERN_CRIT "raid6: unsupported algorithm %d\n", @@ -1365,6 +1364,10 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, break; } + if (sh) { + sh->pd_idx = pd_idx; + sh->qd_idx = qd_idx; + } /* * Finally, compute the new sector number */ @@ -1382,8 +1385,9 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) int sectors_per_chunk = conf->chunk_size >> 9; sector_t stripe; int chunk_offset; - int chunk_number, dummy1, dummy2, dummy3, dd_idx = i; + int chunk_number, dummy1, dd_idx = i; sector_t r_sector; + struct stripe_head sh2; chunk_offset = sector_div(new_sector, sectors_per_chunk); @@ -1446,8 +1450,9 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) check = raid5_compute_sector(conf, r_sector, (raid_disks != conf->raid_disks), - &dummy1, &dummy2, &dummy3); - if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { + &dummy1, &sh2); + if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx + || sh2.qd_idx != sh->qd_idx) { printk(KERN_ERR "compute_blocknr: map not correct\n"); return 0; } @@ -1843,11 +1848,11 @@ static int page_is_zero(struct page *p) memcmp(a, a+4, STRIPE_SIZE-4)==0); } -static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous, - int *qd_idxp) +static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, + struct stripe_head *sh) { int sectors_per_chunk = 
conf->chunk_size >> 9; - int pd_idx, dd_idx; + int dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); int disks = previous ? conf->previous_raid_disks : conf->raid_disks; @@ -1855,8 +1860,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous, stripe * (disks - conf->max_degraded) *sectors_per_chunk + chunk_offset, previous, - &dd_idx, &pd_idx, qd_idxp); - return pd_idx; + &dd_idx, sh); } static void @@ -2514,13 +2518,12 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i = 0; i < sh->disks; i++) if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { - int dd_idx, pd_idx, qd_idx, j; + int dd_idx, j; struct stripe_head *sh2; sector_t bn = compute_blocknr(sh, i); - sector_t s = - raid5_compute_sector(conf, bn, 0, - &dd_idx, &pd_idx, &qd_idx); + sector_t s = raid5_compute_sector(conf, bn, 0, + &dd_idx, NULL); sh2 = get_active_stripe(conf, s, 0, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe @@ -2804,11 +2807,9 @@ static bool handle_stripe5(struct stripe_head *sh) if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && !sh->reconstruct_state) { - int qd_idx; /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0, &qd_idx); - sh->qd_idx = qd_idx; + stripe_set_idx(sh->sector, conf, 0, sh); schedule_reconstruction5(sh, &s, 1, 1); } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); @@ -3025,10 +3026,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { /* Need to write out all blocks after computing P&Q */ - int qd_idx; sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0, &qd_idx); - sh->qd_idx = qd_idx; + stripe_set_idx(sh->sector, conf, 0, sh); compute_parity6(sh, 
RECONSTRUCT_WRITE); for (i = conf->raid_disks ; i-- ; ) { set_bit(R5_LOCKED, &sh->dev[i].flags); @@ -3300,7 +3299,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); - unsigned int dd_idx, pd_idx, qd_idx; + unsigned int dd_idx; struct bio* align_bi; mdk_rdev_t *rdev; @@ -3325,7 +3324,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) */ align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 0, - &dd_idx, &pd_idx, &qd_idx); + &dd_idx, NULL); rcu_read_lock(); rdev = rcu_dereference(conf->disks[dd_idx].rdev); @@ -3417,7 +3416,7 @@ static int make_request(struct request_queue *q, struct bio * bi) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); - int dd_idx, pd_idx, qd_idx; + int dd_idx; sector_t new_sector; sector_t logical_sector, last_sector; struct stripe_head *sh; @@ -3484,7 +3483,7 @@ static int make_request(struct request_queue *q, struct bio * bi) new_sector = raid5_compute_sector(conf, logical_sector, previous, - &dd_idx, &pd_idx, &qd_idx); + &dd_idx, NULL); pr_debug("raid5: make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, (unsigned long long)logical_sector); @@ -3572,7 +3571,6 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped */ raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; - int pd_idx, qd_idx; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -3662,11 +3660,11 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped */ first_sector = raid5_compute_sector(conf, sector_nr*(new_data_disks), - 1, &dd_idx, &pd_idx, &qd_idx); + 1, &dd_idx, NULL); last_sector = raid5_compute_sector(conf, ((sector_nr+conf->chunk_size/512) *(new_data_disks) - 1), - 1, &dd_idx, &pd_idx, &qd_idx); + 1, 
&dd_idx, NULL); if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { @@ -3801,7 +3799,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. */ struct stripe_head *sh; - int dd_idx, pd_idx, qd_idx; + int dd_idx; sector_t sector, logical_sector, last_sector; int scnt = 0; int remaining; @@ -3809,7 +3807,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); sector = raid5_compute_sector(conf, logical_sector, - 0, &dd_idx, &pd_idx, &qd_idx); + 0, &dd_idx, NULL); last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); for (; logical_sector < last_sector; -- cgit v0.10.2 From 99c0fb5f92828ae96909d390f2df137b89093b37 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:38 +1100 Subject: md/raid5: Add support for new layouts for raid5 and raid6. DDF uses different layouts for P and Q blocks than current md/raid6 so add those that are missing. Also add support for RAID6 layouts that are identical to various raid5 layouts with the simple addition of one device to hold all of the 'Q' blocks. Finally add 'raid5' layouts to match raid4. These last two will allow online level conversion. Note that this does not provide correct support for DDF/raid6 yet as the order in which data blocks are summed to produce the Q block is significant and different between current md code and DDF requirements.
Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2e2e64f..c1d94ed 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1098,7 +1098,7 @@ static void shrink_stripes(raid5_conf_t *conf) static void raid5_end_read_request(struct bio * bi, int error) { - struct stripe_head *sh = bi->bi_private; + struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1180,7 +1180,7 @@ static void raid5_end_read_request(struct bio * bi, int error) static void raid5_end_write_request(struct bio *bi, int error) { - struct stripe_head *sh = bi->bi_private; + struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1320,20 +1320,27 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, pd_idx = stripe % raid_disks; *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; break; + case ALGORITHM_PARITY_0: + pd_idx = 0; + (*dd_idx)++; + break; + case ALGORITHM_PARITY_N: + pd_idx = data_disks; + break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", conf->algorithm); + BUG(); } break; case 6: - /**** FIX THIS ****/ switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: pd_idx = raid_disks - 1 - (stripe % raid_disks); qd_idx = pd_idx + 1; if (pd_idx == raid_disks-1) { - (*dd_idx)++; /* Q D D D P */ + (*dd_idx)++; /* Q D D D P */ qd_idx = 0; } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ @@ -1342,7 +1349,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, pd_idx = stripe % raid_disks; qd_idx = pd_idx + 1; if (pd_idx == raid_disks-1) { - (*dd_idx)++; /* Q D D D P */ + (*dd_idx)++; /* Q D D D P */ qd_idx = 0; } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ @@ -1357,9 +1364,89 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, qd_idx 
= (pd_idx + 1) % raid_disks; *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; break; + + case ALGORITHM_PARITY_0: + pd_idx = 0; + qd_idx = 1; + (*dd_idx) += 2; + break; + case ALGORITHM_PARITY_N: + pd_idx = data_disks; + qd_idx = data_disks + 1; + break; + + case ALGORITHM_ROTATING_ZERO_RESTART: + /* Exactly the same as RIGHT_ASYMMETRIC, but the order + * of blocks for computing Q is different. + */ + pd_idx = stripe % raid_disks; + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + break; + + case ALGORITHM_ROTATING_N_RESTART: + /* Same as left_asymmetric, but first stripe is + * D D D P Q rather than + * Q D D D P + */ + pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + break; + + case ALGORITHM_ROTATING_N_CONTINUE: + /* Same as left_symmetric but Q is before P */ + pd_idx = raid_disks - 1 - (stripe % raid_disks); + qd_idx = (pd_idx + raid_disks - 1) % raid_disks; + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + break; + + case ALGORITHM_LEFT_ASYMMETRIC_6: + /* RAID5 left_asymmetric, with Q on last device */ + pd_idx = data_disks - stripe % (raid_disks-1); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_RIGHT_ASYMMETRIC_6: + pd_idx = stripe % (raid_disks-1); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_LEFT_SYMMETRIC_6: + pd_idx = data_disks - stripe % (raid_disks-1); + *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_RIGHT_SYMMETRIC_6: + pd_idx = stripe % (raid_disks-1); + *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_PARITY_0_6: + pd_idx = 0; + (*dd_idx)++; + qd_idx =
raid_disks - 1; + break; + + default: printk(KERN_CRIT "raid6: unsupported algorithm %d\n", conf->algorithm); + BUG(); } break; } @@ -1411,9 +1498,15 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) i += raid_disks; i -= (sh->pd_idx + 1); break; + case ALGORITHM_PARITY_0: + i -= 1; + break; + case ALGORITHM_PARITY_N: + break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", conf->algorithm); + BUG(); } break; case 6: @@ -1422,8 +1515,10 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: - if (sh->pd_idx == raid_disks-1) - i--; /* Q D D D P */ + case ALGORITHM_ROTATING_ZERO_RESTART: + case ALGORITHM_ROTATING_N_RESTART: + if (sh->pd_idx == raid_disks-1) + i--; /* Q D D D P */ else if (i > sh->pd_idx) i -= 2; /* D D P Q D */ break; @@ -1438,9 +1533,35 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) i -= (sh->pd_idx + 2); } break; + case ALGORITHM_PARITY_0: + i -= 2; + break; + case ALGORITHM_PARITY_N: + break; + case ALGORITHM_ROTATING_N_CONTINUE: + if (sh->pd_idx == 0) + i--; /* P D D D Q */ + else if (i > sh->pd_idx) + i -= 2; /* D D Q P D */ + break; + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + if (i > sh->pd_idx) + i--; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_RIGHT_SYMMETRIC_6: + if (i < sh->pd_idx) + i += data_disks + 1; + i -= (sh->pd_idx + 1); + break; + case ALGORITHM_PARITY_0_6: + i -= 1; + break; default: printk(KERN_CRIT "raid6: unsupported algorithm %d\n", conf->algorithm); + BUG(); } break; } @@ -3308,7 +3429,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) return 0; } /* - * use bio_clone to make a copy of the bio + * use bio_clone to make a copy of the bio */ align_bi = bio_clone(raid_bio, GFP_NOIO); if (!align_bi) @@ -3439,7 +3560,7 @@ static int make_request(struct request_queue *q, struct bio * bi) if (rw == READ 
&& mddev->reshape_position == MaxSector && chunk_aligned_read(q,bi)) - return 0; + return 0; logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); @@ -4034,6 +4155,12 @@ static int run(mddev_t *mddev) mdname(mddev), mddev->level); return -EIO; } + if ((mddev->level == 5 && !algorithm_valid_raid5(mddev->layout)) || + (mddev->level == 6 && !algorithm_valid_raid6(mddev->layout))) { + printk(KERN_ERR "raid5: %s: layout %d not supported\n", + mdname(mddev), mddev->layout); + return -EIO; + } if (mddev->chunk_size < PAGE_SIZE) { printk(KERN_ERR "md/raid5: chunk_size must be at least " @@ -4185,12 +4312,6 @@ static int run(mddev_t *mddev) conf->chunk_size, mdname(mddev)); goto abort; } - if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { - printk(KERN_ERR - "raid5: unsupported parity algorithm %d for %s\n", - conf->algorithm, mdname(mddev)); - goto abort; - } if (mddev->degraded > conf->max_degraded) { printk(KERN_ERR "raid5: not enough operational devices for %s" " (%d/%d failed)\n", diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 0c7375a..633d792 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -394,9 +394,62 @@ typedef struct raid5_private_data raid5_conf_t; /* * Our supported algorithms */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 -#define ALGORITHM_RIGHT_ASYMMETRIC 1 -#define ALGORITHM_LEFT_SYMMETRIC 2 -#define ALGORITHM_RIGHT_SYMMETRIC 3 +#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */ +#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */ +#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */ +#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */ +/* Define non-rotating (raid4) algorithms. These allow + * conversion of raid4 to raid5. + */ +#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */ +#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. 
*/ + +/* DDF RAID6 layouts differ from md/raid6 layouts in two ways. + * Firstly, the exact positioning of the parity block is slightly + * different between the 'LEFT_*' modes of md and the "_N_*" modes + * of DDF. + * Secondly, the order of datablocks over which the Q syndrome is computed + * is different. + * Consequently we have different layouts for DDF/raid6 than md/raid6. + * These layouts are from the DDFv1.2 spec. + * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but + * leaves RLQ=3 as 'Vendor Specific' + */ + +#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */ +#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ +#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */ + + +/* For every RAID5 algorithm we define a RAID6 algorithm + * with exactly the same layout for data and parity, and + * with the Q block always on the last device (N-1). + * This allows trivial conversion from RAID5 to RAID6 + */ +#define ALGORITHM_LEFT_ASYMMETRIC_6 16 +#define ALGORITHM_RIGHT_ASYMMETRIC_6 17 +#define ALGORITHM_LEFT_SYMMETRIC_6 18 +#define ALGORITHM_RIGHT_SYMMETRIC_6 19 +#define ALGORITHM_PARITY_0_6 20 +#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N + +static inline int algorithm_valid_raid5(int layout) +{ + return (layout >= 0) && + (layout <= 5); +} +static inline int algorithm_valid_raid6(int layout) +{ + return (layout >= 0 && layout <= 5) + || + (layout == 8 || layout == 10) + || + (layout >= 16 && layout <= 20); +} + +static inline int algorithm_is_DDF(int layout) +{ + return layout >= 8 && layout <= 10; +} #endif -- cgit v0.10.2 From 67cc2b8165857ba019920d1f00d64bcc4140075d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:38 +1100 Subject: md/raid5: finish support for DDF/raid6 DDF requires RAID6 calculations over different devices in a different order. For md/raid6, we calculate over just the data devices, starting immediately after the 'Q' block.
For ddf/raid6 we calculate over all devices, using zeros in place of the P and Q blocks. This requires unfortunately complex loops... Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c1d94ed..edbc80c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -136,6 +136,10 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) /* Find first data disk in a raid6 stripe */ static inline int raid6_d0(struct stripe_head *sh) { + if (sh->ddf_layout) + /* ddf always start from first device */ + return 0; + /* md starts just after Q block */ if (sh->qd_idx == sh->disks - 1) return 0; else @@ -152,13 +156,15 @@ static inline int raid6_next_disk(int disk, int raid_disks) * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk * is raid_disks-1. This help does that mapping. */ -static int raid6_idx_to_slot(int idx, struct stripe_head *sh, int *count) +static int raid6_idx_to_slot(int idx, struct stripe_head *sh, + int *count, int syndrome_disks) { int slot; + if (idx == sh->pd_idx) - return sh->disks - 2; + return syndrome_disks; if (idx == sh->qd_idx) - return sh->disks - 1; + return syndrome_disks + 1; slot = (*count)++; return slot; } @@ -1267,6 +1273,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, unsigned long chunk_number; unsigned int chunk_offset; int pd_idx, qd_idx; + int ddf_layout = 0; sector_t new_sector; int sectors_per_chunk = conf->chunk_size >> 9; int raid_disks = previous ? 
conf->previous_raid_disks @@ -1386,6 +1393,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, qd_idx = 0; } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; break; case ALGORITHM_ROTATING_N_RESTART: @@ -1400,6 +1408,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, qd_idx = 0; } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; break; case ALGORITHM_ROTATING_N_CONTINUE: @@ -1407,6 +1416,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, pd_idx = raid_disks - 1 - (stripe % raid_disks); qd_idx = (pd_idx + raid_disks - 1) % raid_disks; *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + ddf_layout = 1; break; case ALGORITHM_LEFT_ASYMMETRIC_6: @@ -1454,6 +1464,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, if (sh) { sh->pd_idx = pd_idx; sh->qd_idx = qd_idx; + sh->ddf_layout = ddf_layout; } /* * Finally, compute the new sector number @@ -1642,9 +1653,10 @@ static void compute_parity6(struct stripe_head *sh, int method) { raid5_conf_t *conf = sh->raid_conf; int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; + int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); struct bio *chosen; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; + void *ptrs[syndrome_disks+2]; pd_idx = sh->pd_idx; qd_idx = sh->qd_idx; @@ -1687,23 +1699,28 @@ static void compute_parity6(struct stripe_head *sh, int method) } /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ - /* FIX: Is this ordering of drives even remotely optimal? 
*/ + + for (i = 0; i < disks; i++) + ptrs[i] = (void *)raid6_empty_zero_page; + count = 0; i = d0_idx; do { - int slot = raid6_idx_to_slot(i, sh, &count); + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + ptrs[slot] = page_address(sh->dev[i].page); - if (slot < sh->disks - 2 && + if (slot < syndrome_disks && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { printk(KERN_ERR "block %d/%d not uptodate " "on parity calc\n", i, count); BUG(); } + i = raid6_next_disk(i, disks); } while (i != d0_idx); - BUG_ON(count+2 != disks); + BUG_ON(count != syndrome_disks); - raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); + raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); switch(method) { case RECONSTRUCT_WRITE: @@ -1761,24 +1778,28 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) { int i, count, disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? disks : disks-2; int d0_idx = raid6_d0(sh); int faila = -1, failb = -1; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; + void *ptrs[syndrome_disks+2]; + for (i = 0; i < disks ; i++) + ptrs[i] = (void *)raid6_empty_zero_page; count = 0; i = d0_idx; do { - int slot; - slot = raid6_idx_to_slot(i, sh, &count); + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + ptrs[slot] = page_address(sh->dev[i].page); + if (i == dd_idx1) faila = slot; if (i == dd_idx2) failb = slot; i = raid6_next_disk(i, disks); } while (i != d0_idx); - BUG_ON(count+2 != disks); + BUG_ON(count != syndrome_disks); BUG_ON(faila == failb); if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } @@ -1787,9 +1808,9 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); - if ( failb == disks-1 ) { + if (failb == syndrome_disks+1) { /* Q disk is one of the missing disks */ - if ( 
faila == disks-2 ) { + if (faila == syndrome_disks) { /* Missing P+Q, just recompute */ compute_parity6(sh, UPDATE_PARITY); return; @@ -1804,12 +1825,13 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) } /* We're missing D+P or D+D; */ - if (failb == disks-2) { + if (failb == syndrome_disks) { /* We're missing D+P. */ - raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); + raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); } else { /* We're missing D+D. */ - raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); + raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, + ptrs); } /* Both the above update both missing blocks */ diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 633d792..84456b1 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -201,6 +201,7 @@ struct stripe_head { sector_t sector; /* sector of this row */ short pd_idx; /* parity disk index */ short qd_idx; /* 'Q' disk index for raid6 */ + short ddf_layout;/* use DDF ordering to calculate Q */ unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ spinlock_t lock; -- cgit v0.10.2 From 34817e8c3948ea20316dfa8fd8947d6d0ee82ba9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:38 +1100 Subject: md: make sure new_level, new_chunksize, new_layout always have sensible values. When an md array is undergoing a change, we have new_* fields that show the new values. When no change is happening, it is least confusing if these have the same value as the normal fields. This is true in most cases, but not when the values are set via sysfs. So fix this up. A subsequent patch will BUG_ON if these things aren't consistent. 
Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 07ab679..117ea5f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2633,9 +2633,9 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) return -EBUSY; - if (mddev->reshape_position != MaxSector) - mddev->new_layout = n; - else + + mddev->new_layout = n; + if (mddev->reshape_position == MaxSector) mddev->layout = n; return len; } @@ -2702,9 +2702,9 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) return -EBUSY; - else if (mddev->reshape_position != MaxSector) - mddev->new_chunk = n; - else + + mddev->new_chunk = n; + if (mddev->reshape_position == MaxSector) mddev->chunk_size = n; return len; } @@ -3831,7 +3831,10 @@ static int do_md_run(mddev_t * mddev) } mddev->pers = pers; spin_unlock(&pers_lock); - mddev->level = pers->level; + if (mddev->level != pers->level) { + mddev->level = pers->level; + mddev->new_level = pers->level; + } strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); if (pers->level >= 4 && pers->level <= 6) -- cgit v0.10.2 From 91adb56473febeeb3ef657bb5147ddd355465700 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:39 +1100 Subject: md/raid5: refactor raid5 "run" .. so that the code to create the private data structures is separate. This will help with future code to change the level of an active array. 
Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index edbc80c..d019a85 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4164,95 +4164,49 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int run(mddev_t *mddev) +static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; int raid_disk, memory; mdk_rdev_t *rdev; struct disk_info *disk; - int working_disks = 0; - if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { + if (mddev->new_level != 5 + && mddev->new_level != 4 + && mddev->new_level != 6) { printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", - mdname(mddev), mddev->level); - return -EIO; + mdname(mddev), mddev->new_level); + return ERR_PTR(-EIO); } - if ((mddev->level == 5 && !algorithm_valid_raid5(mddev->layout)) || - (mddev->level == 6 && !algorithm_valid_raid6(mddev->layout))) { + if ((mddev->new_level == 5 + && !algorithm_valid_raid5(mddev->new_layout)) || + (mddev->new_level == 6 + && !algorithm_valid_raid6(mddev->new_layout))) { printk(KERN_ERR "raid5: %s: layout %d not supported\n", - mdname(mddev), mddev->layout); - return -EIO; + mdname(mddev), mddev->new_layout); + return ERR_PTR(-EIO); } - - if (mddev->chunk_size < PAGE_SIZE) { - printk(KERN_ERR "md/raid5: chunk_size must be at least " - "PAGE_SIZE but %d < %ld\n", - mddev->chunk_size, PAGE_SIZE); - return -EINVAL; + if (mddev->new_level == 6 && mddev->raid_disks < 4) { + printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", + mdname(mddev), mddev->raid_disks); + return ERR_PTR(-EINVAL); } - if (mddev->reshape_position != MaxSector) { - /* Check that we can continue the reshape. - * Currently only disks can change, it must - * increase, and we must be past the point where - * a stripe over-writes itself - */ - sector_t here_new, here_old; - int old_disks; - int max_degraded = (mddev->level == 5 ? 
1 : 2); - - if (mddev->new_level != mddev->level || - mddev->new_layout != mddev->layout || - mddev->new_chunk != mddev->chunk_size) { - printk(KERN_ERR "raid5: %s: unsupported reshape " - "required - aborting.\n", - mdname(mddev)); - return -EINVAL; - } - if (mddev->delta_disks <= 0) { - printk(KERN_ERR "raid5: %s: unsupported reshape " - "(reduce disks) required - aborting.\n", - mdname(mddev)); - return -EINVAL; - } - old_disks = mddev->raid_disks - mddev->delta_disks; - /* reshape_position must be on a new-stripe boundary, and one - * further up in new geometry must map after here in old - * geometry. - */ - here_new = mddev->reshape_position; - if (sector_div(here_new, (mddev->chunk_size>>9)* - (mddev->raid_disks - max_degraded))) { - printk(KERN_ERR "raid5: reshape_position not " - "on a stripe boundary\n"); - return -EINVAL; - } - /* here_new is the stripe we will write to */ - here_old = mddev->reshape_position; - sector_div(here_old, (mddev->chunk_size>>9)* - (old_disks-max_degraded)); - /* here_old is the first stripe that we might need to read - * from */ - if (here_new >= here_old) { - /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "raid5: reshape_position too early for " - "auto-recovery - aborting.\n"); - return -EINVAL; - } - printk(KERN_INFO "raid5: reshape will continue\n"); - /* OK, we should be able to continue; */ + if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { + printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", + mddev->new_chunk, mdname(mddev)); + return ERR_PTR(-EINVAL); } - - mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); - if ((conf = mddev->private) == NULL) + conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); + if (conf == NULL) goto abort; - if (mddev->reshape_position == MaxSector) { - conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; - } else { - conf->raid_disks = mddev->raid_disks; + + conf->raid_disks = mddev->raid_disks; + if (mddev->reshape_position == 
MaxSector) + conf->previous_raid_disks = mddev->raid_disks; + else conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; - } conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), GFP_KERNEL); @@ -4264,13 +4218,12 @@ static int run(mddev_t *mddev) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; - if (mddev->level == 6) { + if (mddev->new_level == 6) { conf->spare_page = alloc_page(GFP_KERNEL); if (!conf->spare_page) goto abort; } spin_lock_init(&conf->device_lock); - mddev->queue->queue_lock = &conf->device_lock; init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); @@ -4299,41 +4252,136 @@ static int run(mddev_t *mddev) printk(KERN_INFO "raid5: device %s operational as raid" " disk %d\n", bdevname(rdev->bdev,b), raid_disk); - working_disks++; } else /* Cannot rely on bitmap to complete recovery */ conf->fullsync = 1; } - /* - * 0 for a fully functional array, 1 or 2 for a degraded array. 
- */ - mddev->degraded = conf->raid_disks - working_disks; - conf->mddev = mddev; - conf->chunk_size = mddev->chunk_size; - conf->level = mddev->level; + conf->chunk_size = mddev->new_chunk; + conf->level = mddev->new_level; if (conf->level == 6) conf->max_degraded = 2; else conf->max_degraded = 1; - conf->algorithm = mddev->layout; + conf->algorithm = mddev->new_layout; conf->max_nr_stripes = NR_STRIPES; conf->expand_progress = mddev->reshape_position; - /* device size must be a multiple of chunk size */ - mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); - mddev->resync_max_sectors = mddev->dev_sectors; + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; + if (grow_stripes(conf, conf->max_nr_stripes)) { + printk(KERN_ERR + "raid5: couldn't allocate %dkB for buffers\n", memory); + goto abort; + } else + printk(KERN_INFO "raid5: allocated %dkB for %s\n", + memory, mdname(mddev)); - if (conf->level == 6 && conf->raid_disks < 4) { - printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", - mdname(mddev), conf->raid_disks); + conf->thread = md_register_thread(raid5d, mddev, "%s_raid5"); + if (!conf->thread) { + printk(KERN_ERR + "raid5: couldn't allocate thread for %s\n", + mdname(mddev)); goto abort; } - if (!conf->chunk_size || conf->chunk_size % 4) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", - conf->chunk_size, mdname(mddev)); - goto abort; + + return conf; + + abort: + if (conf) { + shrink_stripes(conf); + safe_put_page(conf->spare_page); + kfree(conf->disks); + kfree(conf->stripe_hashtbl); + kfree(conf); + return ERR_PTR(-EIO); + } else + return ERR_PTR(-ENOMEM); +} + +static int run(mddev_t *mddev) +{ + raid5_conf_t *conf; + int working_disks = 0; + mdk_rdev_t *rdev; + + if (mddev->reshape_position != MaxSector) { + /* Check that we can continue the reshape. 
+ * Currently only disks can change, it must + * increase, and we must be past the point where + * a stripe over-writes itself + */ + sector_t here_new, here_old; + int old_disks; + int max_degraded = (mddev->level == 5 ? 1 : 2); + + if (mddev->new_level != mddev->level || + mddev->new_layout != mddev->layout || + mddev->new_chunk != mddev->chunk_size) { + printk(KERN_ERR "raid5: %s: unsupported reshape " + "required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->delta_disks <= 0) { + printk(KERN_ERR "raid5: %s: unsupported reshape " + "(reduce disks) required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + old_disks = mddev->raid_disks - mddev->delta_disks; + /* reshape_position must be on a new-stripe boundary, and one + * further up in new geometry must map after here in old + * geometry. + */ + here_new = mddev->reshape_position; + if (sector_div(here_new, (mddev->chunk_size>>9)* + (mddev->raid_disks - max_degraded))) { + printk(KERN_ERR "raid5: reshape_position not " + "on a stripe boundary\n"); + return -EINVAL; + } + /* here_new is the stripe we will write to */ + here_old = mddev->reshape_position; + sector_div(here_old, (mddev->chunk_size>>9)* + (old_disks-max_degraded)); + /* here_old is the first stripe that we might need to read + * from */ + if (here_new >= here_old) { + /* Reading from the same stripe as writing to - bad */ + printk(KERN_ERR "raid5: reshape_position too early for " + "auto-recovery - aborting.\n"); + return -EINVAL; + } + printk(KERN_INFO "raid5: reshape will continue\n"); + /* OK, we should be able to continue; */ + } else { + BUG_ON(mddev->level != mddev->new_level); + BUG_ON(mddev->layout != mddev->new_layout); + BUG_ON(mddev->chunk_size != mddev->new_chunk); + BUG_ON(mddev->delta_disks != 0); } + conf = setup_conf(mddev); + + if (conf == NULL) + return -EIO; + if (IS_ERR(conf)) + return PTR_ERR(conf); + + mddev->thread = conf->thread; + conf->thread = NULL; + mddev->private = conf; + + /* + * 0 for 
a fully functional array, 1 or 2 for a degraded array. + */ + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0 && + test_bit(In_sync, &rdev->flags)) + working_disks++; + + mddev->degraded = conf->raid_disks - working_disks; + if (mddev->degraded > conf->max_degraded) { printk(KERN_ERR "raid5: not enough operational devices for %s" " (%d/%d failed)\n", @@ -4341,6 +4389,10 @@ static int run(mddev_t *mddev) goto abort; } + /* device size must be a multiple of chunk size */ + mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); + mddev->resync_max_sectors = mddev->dev_sectors; + if (mddev->degraded > 0 && mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) @@ -4356,27 +4408,6 @@ static int run(mddev_t *mddev) } } - { - mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5"); - if (!mddev->thread) { - printk(KERN_ERR - "raid5: couldn't allocate thread for %s\n", - mdname(mddev)); - goto abort; - } - } - memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + - conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes)) { - printk(KERN_ERR - "raid5: couldn't allocate %dkB for buffers\n", memory); - shrink_stripes(conf); - md_unregister_thread(mddev->thread); - goto abort; - } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", - memory, mdname(mddev)); - if (mddev->degraded == 0) printk("raid5: raid level %d set %s active with %d out of %d" " devices, algorithm %d\n", conf->level, mdname(mddev), @@ -4419,6 +4450,8 @@ static int run(mddev_t *mddev) "raid5: failed to create sysfs attributes for %s\n", mdname(mddev)); + mddev->queue->queue_lock = &conf->device_lock; + mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; @@ -4430,7 +4463,11 @@ static int run(mddev_t *mddev) return 0; abort: + if (mddev->thread) + md_unregister_thread(mddev->thread); + 
mddev->thread = NULL; if (conf) { + shrink_stripes(conf); print_raid5_conf(conf); safe_put_page(conf->spare_page); kfree(conf->disks); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 84456b1..c172371 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -386,6 +386,11 @@ struct raid5_private_data { int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct mdk_thread_s *thread; }; typedef struct raid5_private_data raid5_conf_t; -- cgit v0.10.2 From e0cf8f045b2023b0b3f919ee93eb94345f648434 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:39 +1100 Subject: md: md_unregister_thread should cope with being passed NULL Mostly md_unregister_thread is only called when we know that the thread is not NULL, but sometimes we need to check first. It is safer to put the check inside md_unregister_thread itself.
Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 117ea5f..f30f09c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5382,6 +5382,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, void md_unregister_thread(mdk_thread_t *thread) { + if (!thread) + return; dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d019a85..81789fa 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4463,8 +4463,7 @@ static int run(mddev_t *mddev) return 0; abort: - if (mddev->thread) - md_unregister_thread(mddev->thread); + md_unregister_thread(mddev->thread); mddev->thread = NULL; if (conf) { shrink_stripes(conf); -- cgit v0.10.2 From 409c57f3801701dfee27a28103dda4831306cb20 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:39 +1100 Subject: md: enable suspend/resume of md devices. To be able to change the 'level' of an md/raid array, we need to suspend the device so that no requests are active - then move some pointers around etc. The code already keeps counts of active requests and the ->quiesce function can be used to wait until those counts hit zero. However the quiesce function blocks new requests once they are already 'inside' the personality module, and that is too late if we want to replace the personality modules. So make all md requests come in through a common md_make_request function that keeps track of how many requests have entered the modules but may not yet be on the internal reference counts. Allow md_make_request to be blocked when we want to suspend the device, and make it possible to wait for all those in-transit requests to be added to internal lists so that ->quiesce can wait for them.
There is still a problem that when a request completes, we drop the ref count inside the personality code so there is a short time between when the refcount hits zero, and when the personality code is no longer being used. The personality code never blocks (schedule or spinlock) between dropping the refcount and exiting the routine, so this should be safe (as put_module calls synchronize_sched() before unmapping the module code). Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index f30f09c..6cb31f8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -201,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock); ) -static int md_fail_request(struct request_queue *q, struct bio *bio) +/* Rather than calling directly into the personality make_request function, + * IO requests come here first so that we can check if the device is + * being suspended pending a reconfiguration. + * We hold a refcount over the call to ->make_request. By the time that + * call has finished, the bio has been linked into some internal structure + * and so is visible to ->quiesce(), so we don't need the refcount any more. 
+ */ +static int md_make_request(struct request_queue *q, struct bio *bio) { - bio_io_error(bio); - return 0; + mddev_t *mddev = q->queuedata; + int rv; + if (mddev == NULL || mddev->pers == NULL) { + bio_io_error(bio); + return 0; + } + rcu_read_lock(); + if (mddev->suspended) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!mddev->suspended) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + rv = mddev->pers->make_request(q, bio); + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); + + return rv; } +static void mddev_suspend(mddev_t *mddev) +{ + BUG_ON(mddev->suspended); + mddev->suspended = 1; + synchronize_rcu(); + wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); + mddev->pers->quiesce(mddev, 1); + md_unregister_thread(mddev->thread); + mddev->thread = NULL; + /* we now know that no code is executing in the personality module, + * except possibly the tail end of a ->bi_end_io function, but that + * is certain to complete before the module has a chance to get + * unloaded + */ +} + +static void mddev_resume(mddev_t *mddev) +{ + mddev->suspended = 0; + wake_up(&mddev->sb_wait); + mddev->pers->quiesce(mddev, 0); +} + + static inline mddev_t *mddev_get(mddev_t *mddev) { atomic_inc(&mddev->active); @@ -314,6 +370,7 @@ static mddev_t * mddev_find(dev_t unit) init_timer(&new->safemode_timer); atomic_set(&new->active, 1); atomic_set(&new->openers, 0); + atomic_set(&new->active_io, 0); spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); init_waitqueue_head(&new->recovery_wait); @@ -3632,10 +3689,12 @@ static int md_alloc(dev_t dev, char *name) mddev_put(mddev); return -ENOMEM; } + mddev->queue->queuedata = mddev; + /* Can be unlocked because the queue is new: no concurrency */ 
queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); - blk_queue_make_request(mddev->queue, md_fail_request); + blk_queue_make_request(mddev->queue, md_make_request); disk = alloc_disk(1 << shift); if (!disk) { @@ -3938,16 +3997,6 @@ static int do_md_run(mddev_t * mddev) set_capacity(disk, mddev->array_sectors); - /* If we call blk_queue_make_request here, it will - * re-initialise max_sectors etc which may have been - * refined inside -> run. So just set the bits we need to set. - * Most initialisation happended when we called - * blk_queue_make_request(..., md_fail_request) - * earlier. - */ - mddev->queue->queuedata = mddev; - mddev->queue->make_request_fn = mddev->pers->make_request; - /* If there is a partially-recovered drive we need to * start recovery here. If we leave it to md_check_recovery, * it will remove the drives and not do the right thing @@ -4077,7 +4126,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) md_super_wait(mddev); if (mddev->ro) set_disk_ro(disk, 0); - blk_queue_make_request(mddev->queue, md_fail_request); + mddev->pers->stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; diff --git a/drivers/md/md.h b/drivers/md/md.h index c07ea91..84b22d6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -119,6 +119,8 @@ struct mddev_s #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ #define MD_CHANGE_PENDING 2 /* superblock update in progress */ + int suspended; + atomic_t active_io; int ro; struct gendisk *gendisk; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7799587..7eaca32 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2092,6 +2092,9 @@ static int stop(mddev_t *mddev) /* need to kick something here to make sure I/O goes? 
*/ } + raise_barrier(conf); + lower_barrier(conf); + md_unregister_thread(mddev->thread); mddev->thread = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d56cb2a..c2059e2 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2211,6 +2211,9 @@ static int stop(mddev_t *mddev) { conf_t *conf = mddev_to_conf(mddev); + raise_barrier(conf, 0); + lower_barrier(conf); + md_unregister_thread(mddev->thread); mddev->thread = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ -- cgit v0.10.2 From 245f46c2c221ef09c7db892f0e3fc2149be42052 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:39 +1100 Subject: md: add ->takeover method to support changing the personality managing an array Implement this for RAID6 to be able to 'takeover' a RAID5 array. The new RAID6 will use a layout which places Q on the last device, and that device will be missing. If there are any available spares, one will immediately have Q recovered onto it. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 6cb31f8..05b613b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2649,18 +2649,101 @@ level_show(mddev_t *mddev, char *page) static ssize_t level_store(mddev_t *mddev, const char *buf, size_t len) { + char level[16]; ssize_t rv = len; - if (mddev->pers) + struct mdk_personality *pers; + void *priv; + + if (mddev->pers == NULL) { + if (len == 0) + return 0; + if (len >= sizeof(mddev->clevel)) + return -ENOSPC; + strncpy(mddev->clevel, buf, len); + if (mddev->clevel[len-1] == '\n') + len--; + mddev->clevel[len] = 0; + mddev->level = LEVEL_NONE; + return rv; + } + + /* request to change the personality. Need to ensure: + * - array is not engaged in resync/recovery/reshape + * - old personality can be suspended + * - new personality will access other array. 
+ */ + + if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; - if (len == 0) - return 0; - if (len >= sizeof(mddev->clevel)) - return -ENOSPC; - strncpy(mddev->clevel, buf, len); - if (mddev->clevel[len-1] == '\n') + + if (!mddev->pers->quiesce) { + printk(KERN_WARNING "md: %s: %s does not support online personality change\n", + mdname(mddev), mddev->pers->name); + return -EINVAL; + } + + /* Now find the new personality */ + if (len == 0 || len >= sizeof(level)) + return -EINVAL; + strncpy(level, buf, len); + if (level[len-1] == '\n') len--; - mddev->clevel[len] = 0; - mddev->level = LEVEL_NONE; + level[len] = 0; + + request_module("md-%s", level); + spin_lock(&pers_lock); + pers = find_pers(LEVEL_NONE, level); + if (!pers || !try_module_get(pers->owner)) { + spin_unlock(&pers_lock); + printk(KERN_WARNING "md: personality %s not loaded\n", level); + return -EINVAL; + } + spin_unlock(&pers_lock); + + if (pers == mddev->pers) { + /* Nothing to do! */ + module_put(pers->owner); + return rv; + } + if (!pers->takeover) { + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", + mdname(mddev), level); + return -EINVAL; + } + + /* ->takeover must set new_* and/or delta_disks + * if it succeeds, and may set them when it fails. 
+ */ + priv = pers->takeover(mddev); + if (IS_ERR(priv)) { + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk = mddev->chunk_size; + mddev->raid_disks -= mddev->delta_disks; + mddev->delta_disks = 0; + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s would not accept array\n", + mdname(mddev), level); + return PTR_ERR(priv); + } + + /* Looks like we have a winner */ + mddev_suspend(mddev); + mddev->pers->stop(mddev); + module_put(mddev->pers->owner); + mddev->pers = pers; + mddev->private = priv; + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_size = mddev->new_chunk; + mddev->delta_disks = 0; + pers->run(mddev); + mddev_resume(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); return rv; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 84b22d6..8034f62 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -321,6 +321,16 @@ struct mdk_personality * others - reserved */ void (*quiesce) (mddev_t *mddev, int state); + /* takeover is used to transition an array from one + * personality to another. The new personality must be able + * to handle the data in the current layout. + * e.g. 2drive raid1 -> 2drive raid5 + * ndrive raid5 -> degraded n+1drive raid6 with special layout + * If the takeover succeeds, a new 'private' structure is returned. + * This needs to be installed and then ->run used to activate the + * array. 
+ */ + void *(*takeover) (mddev_t *mddev); }; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 81789fa..5b346b4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -933,8 +933,10 @@ static int grow_stripes(raid5_conf_t *conf, int num) struct kmem_cache *sc; int devs = conf->raid_disks; - sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev)); - sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev)); + sprintf(conf->cache_name[0], + "raid%d-%s", conf->level, mdname(conf->mddev)); + sprintf(conf->cache_name[1], + "raid%d-%s-alt", conf->level, mdname(conf->mddev)); conf->active_name = 0; sc = kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), @@ -4361,10 +4363,12 @@ static int run(mddev_t *mddev) BUG_ON(mddev->chunk_size != mddev->new_chunk); BUG_ON(mddev->delta_disks != 0); } - conf = setup_conf(mddev); - if (conf == NULL) - return -EIO; + if (mddev->private == NULL) + conf = setup_conf(mddev); + else + conf = mddev->private; + if (IS_ERR(conf)) return PTR_ERR(conf); @@ -4880,6 +4884,55 @@ static void raid5_quiesce(mddev_t *mddev, int state) } } +static struct mdk_personality raid5_personality; + +static void *raid6_takeover(mddev_t *mddev) +{ + /* Currently can only take over a raid5. We map the + * personality to an equivalent raid6 personality + * with the Q block at the end. 
+ */ + int new_layout; + + if (mddev->pers != &raid5_personality) + return ERR_PTR(-EINVAL); + if (mddev->degraded > 1) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks > 253) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks < 3) + return ERR_PTR(-EINVAL); + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; + break; + case ALGORITHM_LEFT_SYMMETRIC: + new_layout = ALGORITHM_LEFT_SYMMETRIC_6; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; + break; + case ALGORITHM_PARITY_0: + new_layout = ALGORITHM_PARITY_0_6; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 6; + mddev->new_layout = new_layout; + mddev->delta_disks = 1; + mddev->raid_disks += 1; + return setup_conf(mddev); +} + + static struct mdk_personality raid6_personality = { .name = "raid6", @@ -4900,6 +4953,7 @@ static struct mdk_personality raid6_personality = .start_reshape = raid5_start_reshape, #endif .quiesce = raid5_quiesce, + .takeover = raid6_takeover, }; static struct mdk_personality raid5_personality = { -- cgit v0.10.2 From d562b0c4313e3ddea402a400371afa47ddf679f9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:39 +1100 Subject: md: add ->takeover method for raid5 to be able to take over raid1 The RAID1 must have two drives and be a suitable size to be a multiple of a chunksize that isn't too small. 
Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5b346b4..611ea7b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4884,6 +4884,53 @@ static void raid5_quiesce(mddev_t *mddev, int state) } } + +static void *raid5_takeover_raid1(mddev_t *mddev) +{ + int chunksect; + + if (mddev->raid_disks != 2 || + mddev->degraded > 1) + return ERR_PTR(-EINVAL); + + /* Should check if there are write-behind devices? */ + + chunksect = 64*2; /* 64K by default */ + + /* The array must be an exact multiple of chunksize */ + while (chunksect && (mddev->array_sectors & (chunksect-1))) + chunksect >>= 1; + + if ((chunksect<<9) < STRIPE_SIZE) + /* array size does not allow a suitable chunk size */ + return ERR_PTR(-EINVAL); + + mddev->new_level = 5; + mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; + mddev->new_chunk = chunksect << 9; + + return setup_conf(mddev); +} + + +static void *raid5_takeover(mddev_t *mddev) +{ + /* raid5 can take over: + * raid0 - if all devices are the same - make it a raid4 layout + * raid1 - if there are two drives. We need to know the chunk size + * raid4 - trivial - just use a raid4 layout. + * raid6 - Providing it is a *_6 layout + * + * For now, just do raid1 + */ + + if (mddev->level == 1) + return raid5_takeover_raid1(mddev); + + return ERR_PTR(-EINVAL); +} + + static struct mdk_personality raid5_personality; static void *raid6_takeover(mddev_t *mddev) @@ -4975,6 +5022,7 @@ static struct mdk_personality raid5_personality = .start_reshape = raid5_start_reshape, #endif .quiesce = raid5_quiesce, + .takeover = raid5_takeover, }; static struct mdk_personality raid4_personality = -- cgit v0.10.2 From b3546035277847028df650b147469fc943cf5c71 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:56:41 +1100 Subject: md/raid5: allow layout/chunksize to be changed on an active 2-drive raid5. 2-drive raid5's aren't very interesting. 
But if you are converting a raid1 into a raid5, you will at least temporarily have one. And that is a good time to set the layout/chunksize for the new RAID5 if you aren't happy with the defaults. layout and chunksize don't actually affect the placement of data on a 2-drive raid5, so we just do some internal book-keeping. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 05b613b..0689d89 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2771,12 +2771,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) if (!*buf || (*e && *e != '\n')) return -EINVAL; - if (mddev->pers) - return -EBUSY; - - mddev->new_layout = n; - if (mddev->reshape_position == MaxSector) - mddev->layout = n; + if (mddev->pers) { + int err; + if (mddev->pers->reconfig == NULL) + return -EBUSY; + err = mddev->pers->reconfig(mddev, n, -1); + if (err) + return err; + } else { + mddev->new_layout = n; + if (mddev->reshape_position == MaxSector) + mddev->layout = n; + } return len; } static struct md_sysfs_entry md_layout = @@ -2833,19 +2839,24 @@ chunk_size_show(mddev_t *mddev, char *page) static ssize_t chunk_size_store(mddev_t *mddev, const char *buf, size_t len) { - /* can only set chunk_size if array is not yet active */ char *e; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; - if (mddev->pers) - return -EBUSY; - - mddev->new_chunk = n; - if (mddev->reshape_position == MaxSector) - mddev->chunk_size = n; + if (mddev->pers) { + int err; + if (mddev->pers->reconfig == NULL) + return -EBUSY; + err = mddev->pers->reconfig(mddev, -1, n); + if (err) + return err; + } else { + mddev->new_chunk = n; + if (mddev->reshape_position == MaxSector) + mddev->chunk_size = n; + } return len; } static struct md_sysfs_entry md_chunk_size = diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 611ea7b..8a5e14e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4913,6 +4913,47 @@ static void
*raid5_takeover_raid1(mddev_t *mddev) } +static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) +{ + /* Currently the layout and chunk size can only be changed + * for a 2-drive raid array, as in that case no data shuffling + * is required. + * Later we might validate these and set new_* so a reshape + * can complete the change. + */ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) + return -EINVAL; + if (new_chunk > 0) { + if (new_chunk & (new_chunk-1)) + /* not a power of 2 */ + return -EINVAL; + if (new_chunk < PAGE_SIZE) + return -EINVAL; + if (mddev->array_sectors & ((new_chunk>>9)-1)) + /* not factor of array size */ + return -EINVAL; + } + + /* They look valid */ + + if (mddev->raid_disks != 2) + return -EINVAL; + + if (new_layout >= 0) { + conf->algorithm = new_layout; + mddev->layout = mddev->new_layout = new_layout; + } + if (new_chunk > 0) { + conf->chunk_size = new_chunk; + mddev->chunk_size = mddev->new_chunk = new_chunk; + } + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + return 0; +} + static void *raid5_takeover(mddev_t *mddev) { /* raid5 can take over: @@ -5023,6 +5064,7 @@ static struct mdk_personality raid5_personality = #endif .quiesce = raid5_quiesce, .takeover = raid5_takeover, + .reconfig = raid5_reconfig, }; static struct mdk_personality raid4_personality = -- cgit v0.10.2 From e9d4758f6e93488dc719a1445ce54659a570938f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:57:09 +1100 Subject: md: add takeover support for raid4 -> raid5 conversion. 
Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8a5e14e..9595327 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4967,6 +4967,11 @@ static void *raid5_takeover(mddev_t *mddev) if (mddev->level == 1) return raid5_takeover_raid1(mddev); + if (mddev->level == 4) { + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_level = 5; + return setup_conf(mddev); + } return ERR_PTR(-EINVAL); } -- cgit v0.10.2 From fc9739c6d626ee79a148ec367d143b0601299a9d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:57:20 +1100 Subject: md: add takeover support for converting raid6 back into raid5 If a raid6 is still in the layout that comes from converting raid5 into a raid6, this will allow us to convert it back again. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9595327..70b50af 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4912,6 +4912,39 @@ static void *raid5_takeover_raid1(mddev_t *mddev) return setup_conf(mddev); } +static void *raid5_takeover_raid6(mddev_t *mddev) +{ + int new_layout; + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC_6: + new_layout = ALGORITHM_LEFT_ASYMMETRIC; + break; + case ALGORITHM_RIGHT_ASYMMETRIC_6: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + new_layout = ALGORITHM_LEFT_SYMMETRIC; + break; + case ALGORITHM_RIGHT_SYMMETRIC_6: + new_layout = ALGORITHM_RIGHT_SYMMETRIC; + break; + case ALGORITHM_PARITY_0_6: + new_layout = ALGORITHM_PARITY_0; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 5; + mddev->new_layout = new_layout; + mddev->delta_disks = -1; + mddev->raid_disks -= 1; + return setup_conf(mddev); +} + static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) { @@ -4972,6 +5005,8 @@ static void *raid5_takeover(mddev_t *mddev) mddev->new_level = 5; return
setup_conf(mddev); } + if (mddev->level == 6) + return raid5_takeover_raid6(mddev); return ERR_PTR(-EINVAL); } -- cgit v0.10.2 From 93ed05e2a56b5ee06c0100a169433b11c4aa5d0b Mon Sep 17 00:00:00 2001 From: Atsushi SAKAI Date: Tue, 31 Mar 2009 14:57:37 +1100 Subject: md: fix typo in FSF address Hello, I found a typo Bosto"m" in FSF address. And I am checking around linux source code. Here is the only place which uses Bosto"m" (not Boston). Signed-off-by: Atsushi SAKAI Signed-off-by: NeilBrown diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h index 66e6b0c..8a9c823 100644 --- a/drivers/md/raid6.h +++ b/drivers/md/raid6.h @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c index 21987e3..1f6a3c8 100644 --- a/drivers/md/raid6algos.c +++ b/drivers/md/raid6algos.c @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. 
* * ----------------------------------------------------------------------- */ diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc index b9afd35..2175806 100644 --- a/drivers/md/raid6altivec.uc +++ b/drivers/md/raid6altivec.uc @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc index ad004ce..32a0bac 100644 --- a/drivers/md/raid6int.uc +++ b/drivers/md/raid6int.uc @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c index d4e4a1b..804cb50 100644 --- a/drivers/md/raid6mmx.c +++ b/drivers/md/raid6mmx.c @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. 
* * ----------------------------------------------------------------------- */ diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c index a8c4d94..7a98b86 100644 --- a/drivers/md/raid6recov.c +++ b/drivers/md/raid6recov.c @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c index 0666237..15c5889 100644 --- a/drivers/md/raid6sse1.c +++ b/drivers/md/raid6sse1.c @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c index b034ad8..2e92e96 100644 --- a/drivers/md/raid6sse2.c +++ b/drivers/md/raid6sse2.c @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. 
* * ----------------------------------------------------------------------- */ diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h index 99fea7a..4c22c15 100644 --- a/drivers/md/raid6x86.h +++ b/drivers/md/raid6x86.h @@ -5,7 +5,7 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or + * Boston MA 02111-1307, USA; either version 2 of the License, or * (at your option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ -- cgit v0.10.2 From 80c3a6ce4ba4470379b9e6a4d9bcd9d2ee26ae03 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 17 Mar 2009 18:10:40 -0700 Subject: md: add 'size' as a personality method In preparation for giving userspace control over ->array_sectors we need to be able to retrieve the 'default' size, and the 'anticipated' size when a reshape is requested. For personalities that do not reshape emit a warning if anything but the default size is requested. In the raid5 case we need to update ->previous_raid_disks to make the new 'default' size available. 
Reviewed-by: Andre Noll Signed-off-by: Dan Williams diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 18793c1..d76c87f 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -283,6 +283,17 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size) return 0; } +static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(raid_disks, + "%s does not support generic reshape\n", __func__); + + if (sectors == 0) + return mddev->dev_sectors; + + return sectors; +} + static int run(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -301,7 +312,7 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) conf->rdev = rdev; - mddev->array_sectors = mddev->dev_sectors; + mddev->array_sectors = faulty_size(mddev, 0, 0); mddev->private = conf; reconfig(mddev, mddev->layout, -1); @@ -328,6 +339,7 @@ static struct mdk_personality faulty_personality = .stop = stop, .status = status, .reconfig = reconfig, + .size = faulty_size, }; static int __init raid_init(void) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index d5d99290..b6bb976 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -101,6 +101,16 @@ static int linear_congested(void *data, int bits) return ret; } +static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + return conf->array_sectors; +} + static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) { linear_conf_t *conf; @@ -253,7 +263,7 @@ static int linear_run (mddev_t *mddev) if (!conf) return 1; mddev->private = conf; - mddev->array_sectors = conf->array_sectors; + mddev->array_sectors = linear_size(mddev, 0, 0); blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; @@ -287,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) newconf->prev = 
mddev_to_conf(mddev); mddev->private = newconf; mddev->raid_disks++; - mddev->array_sectors = newconf->array_sectors; + mddev->array_sectors = linear_size(mddev, 0, 0); set_capacity(mddev->gendisk, mddev->array_sectors); return 0; } @@ -385,6 +395,7 @@ static struct mdk_personality linear_personality = .stop = linear_stop, .status = linear_status, .hot_add_disk = linear_add, + .size = linear_size, }; static int __init linear_init (void) diff --git a/drivers/md/md.h b/drivers/md/md.h index 8034f62..d2c50da 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -312,6 +312,7 @@ struct mdk_personality int (*spare_active) (mddev_t *mddev); sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); int (*resize) (mddev_t *mddev, sector_t sectors); + sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks); int (*check_reshape) (mddev_t *mddev); int (*start_reshape) (mddev_t *mddev); int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 87accf7..a3f6d86 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -406,6 +406,14 @@ static void multipathd (mddev_t *mddev) spin_unlock_irqrestore(&conf->device_lock, flags); } +static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + return mddev->dev_sectors; +} + static int multipath_run (mddev_t *mddev) { multipath_conf_t *conf; @@ -502,7 +510,7 @@ static int multipath_run (mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_sectors = mddev->dev_sectors; + mddev->array_sectors = multipath_size(mddev, 0, 0); mddev->queue->unplug_fn = multipath_unplug; mddev->queue->backing_dev_info.congested_fn = multipath_congested; @@ -547,6 +555,7 @@ static struct mdk_personality multipath_personality = .error_handler = multipath_error, .hot_add_disk = multipath_add_disk, 
.hot_remove_disk= multipath_remove_disk, + .size = multipath_size, }; static int __init multipath_init (void) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 9aebb4c..bb0df6a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -263,12 +263,25 @@ static int raid0_mergeable_bvec(struct request_queue *q, return max; } +static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + sector_t array_sectors = 0; + mdk_rdev_t *rdev; + + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + list_for_each_entry(rdev, &mddev->disks, same_set) + array_sectors += rdev->sectors; + + return array_sectors; +} + static int raid0_run (mddev_t *mddev) { unsigned cur=0, i=0, nb_zone; s64 sectors; raid0_conf_t *conf; - mdk_rdev_t *rdev; if (mddev->chunk_size == 0) { printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); @@ -293,9 +306,7 @@ static int raid0_run (mddev_t *mddev) goto out_free_conf; /* calculate array device size */ - mddev->array_sectors = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) - mddev->array_sectors += rdev->sectors; + mddev->array_sectors = raid0_size(mddev, 0, 0); printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", (unsigned long long)mddev->array_sectors); @@ -511,6 +522,7 @@ static struct mdk_personality raid0_personality= .run = raid0_run, .stop = raid0_stop, .status = raid0_status, + .size = raid0_size, }; static int __init raid0_init (void) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7eaca32..a2a83b0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1922,6 +1922,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return nr_sectors; } +static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + if (sectors) + return sectors; + + return mddev->dev_sectors; +} + static int run(mddev_t *mddev) { conf_t *conf; @@ -2051,7 +2059,7 @@ static int run(mddev_t *mddev) /* * Ok, everything is 
just fine now */ - mddev->array_sectors = mddev->dev_sectors; + mddev->array_sectors = raid1_size(mddev, 0, 0); mddev->queue->unplug_fn = raid1_unplug; mddev->queue->backing_dev_info.congested_fn = raid1_congested; @@ -2116,7 +2124,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ - mddev->array_sectors = sectors; + mddev->array_sectors = raid1_size(mddev, sectors, 0); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; if (mddev->array_sectors > mddev->dev_sectors && @@ -2270,6 +2278,7 @@ static struct mdk_personality raid1_personality = .spare_active = raid1_spare_active, .sync_request = sync_request, .resize = raid1_resize, + .size = raid1_size, .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, }; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c2059e2..5bf1b24 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2023,6 +2023,25 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i goto skipped; } +static sector_t +raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + sector_t size; + conf_t *conf = mddev_to_conf(mddev); + + if (!raid_disks) + raid_disks = mddev->raid_disks; + if (!sectors) + sectors = mddev->dev_sectors; + + size = sectors >> conf->chunk_shift; + sector_div(size, conf->far_copies); + size = size * raid_disks; + sector_div(size, conf->near_copies); + + return size << conf->chunk_shift; +} + static int run(mddev_t *mddev) { conf_t *conf; @@ -2174,8 +2193,8 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_sectors = size << conf->chunk_shift; - mddev->resync_max_sectors = size << conf->chunk_shift; + mddev->array_sectors = raid10_size(mddev, 0, 0); + mddev->resync_max_sectors = mddev->array_sectors; mddev->queue->unplug_fn = raid10_unplug; mddev->queue->backing_dev_info.congested_fn = raid10_congested; @@ -2261,6 +2280,7 @@ 
static struct mdk_personality raid10_personality = .spare_active = raid10_spare_active, .sync_request = sync_request, .quiesce = raid10_quiesce, + .size = raid10_size, }; static int __init raid_init(void) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 70b50af..2cd619f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4166,6 +4166,20 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; +static sector_t +raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (!sectors) + sectors = mddev->dev_sectors; + if (!raid_disks) + raid_disks = conf->previous_raid_disks; + + sectors &= ~((sector_t)mddev->chunk_size/512 - 1); + return sectors * (raid_disks - conf->max_degraded); +} + static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; @@ -4460,8 +4474,7 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->array_sectors = mddev->dev_sectors * - (conf->previous_raid_disks - conf->max_degraded); + mddev->array_sectors = raid5_size(mddev, 0, 0); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); @@ -4684,11 +4697,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. 
*/ - raid5_conf_t *conf = mddev_to_conf(mddev); - sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_sectors = sectors * (mddev->raid_disks - - conf->max_degraded); + mddev->array_sectors = raid5_size(mddev, sectors, mddev->raid_disks); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { @@ -4824,10 +4834,12 @@ static void end_reshape(raid5_conf_t *conf) struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - conf->mddev->array_sectors = conf->mddev->dev_sectors * - (conf->raid_disks - conf->max_degraded); - set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); - conf->mddev->changed = 1; + mddev_t *mddev = conf->mddev; + + mddev->array_sectors = raid5_size(mddev, 0, conf->raid_disks); + set_capacity(mddev->gendisk, mddev->array_sectors); + mddev->changed = 1; + conf->previous_raid_disks = conf->raid_disks; bdev = bdget_disk(conf->mddev->gendisk, 0); if (bdev) { @@ -5076,6 +5088,7 @@ static struct mdk_personality raid6_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, + .size = raid5_size, #ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, @@ -5098,6 +5111,7 @@ static struct mdk_personality raid5_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, + .size = raid5_size, #ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, @@ -5122,6 +5136,7 @@ static struct mdk_personality raid4_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, + .size = raid5_size, #ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, -- cgit v0.10.2 From 1f403624bde3c678a166984b1e6a727a0ce06f2b Mon Sep 17 00:00:00 2001 From: Dan 
Williams Date: Tue, 31 Mar 2009 14:59:03 +1100 Subject: md: centralize ->array_sectors modifications Get personalities out of the business of directly modifying ->array_sectors. Lays groundwork to introduce policy on when ->array_sectors can be modified. Reviewed-by: Andre Noll Signed-off-by: Dan Williams diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index d76c87f..8695809 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -312,7 +312,7 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) conf->rdev = rdev; - mddev->array_sectors = faulty_size(mddev, 0, 0); + md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); mddev->private = conf; reconfig(mddev, mddev->layout, -1); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index b6bb976..7a36e38 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -263,7 +263,7 @@ static int linear_run (mddev_t *mddev) if (!conf) return 1; mddev->private = conf; - mddev->array_sectors = linear_size(mddev, 0, 0); + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; @@ -297,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) newconf->prev = mddev_to_conf(mddev); mddev->private = newconf; mddev->raid_disks++; - mddev->array_sectors = linear_size(mddev, 0, 0); + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 0689d89..76ba69b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4977,6 +4977,12 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) return 0; } +void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) +{ + mddev->array_sectors = array_sectors; +} +EXPORT_SYMBOL(md_set_array_sectors); + static int update_size(mddev_t *mddev, sector_t num_sectors) { mdk_rdev_t *rdev; diff --git a/drivers/md/md.h 
b/drivers/md/md.h index d2c50da..ce89dda 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -430,3 +430,4 @@ extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); extern int md_allow_write(mddev_t *mddev); extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); +extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index a3f6d86..41ced0c 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -510,7 +510,7 @@ static int multipath_run (mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_sectors = multipath_size(mddev, 0, 0); + md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); mddev->queue->unplug_fn = multipath_unplug; mddev->queue->backing_dev_info.congested_fn = multipath_congested; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index bb0df6a..6f7e538 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -306,7 +306,7 @@ static int raid0_run (mddev_t *mddev) goto out_free_conf; /* calculate array device size */ - mddev->array_sectors = raid0_size(mddev, 0, 0); + md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", (unsigned long long)mddev->array_sectors); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a2a83b0..044116b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2059,7 +2059,7 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_sectors = raid1_size(mddev, 0, 0); + md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); mddev->queue->unplug_fn = raid1_unplug; mddev->queue->backing_dev_info.congested_fn = raid1_congested; @@ -2124,7 +2124,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. 
*/ - mddev->array_sectors = raid1_size(mddev, sectors, 0); + md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; if (mddev->array_sectors > mddev->dev_sectors && diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5bf1b24..ad153b2 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2193,7 +2193,7 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_sectors = raid10_size(mddev, 0, 0); + md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); mddev->resync_max_sectors = mddev->array_sectors; mddev->queue->unplug_fn = raid10_unplug; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2cd619f..2930fc2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4474,7 +4474,7 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->array_sectors = raid5_size(mddev, 0, 0); + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); @@ -4698,7 +4698,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) * worth it. 
*/ sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_sectors = raid5_size(mddev, sectors, mddev->raid_disks); + md_set_array_sectors(mddev, raid5_size(mddev, sectors, + mddev->raid_disks)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { @@ -4836,7 +4837,8 @@ static void end_reshape(raid5_conf_t *conf) if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { mddev_t *mddev = conf->mddev; - mddev->array_sectors = raid5_size(mddev, 0, conf->raid_disks); + md_set_array_sectors(mddev, raid5_size(mddev, 0, + conf->raid_disks)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; conf->previous_raid_disks = conf->raid_disks; -- cgit v0.10.2 From b522adcde9c4d3fb7b579cfa9160d8bde7744be8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 31 Mar 2009 15:00:31 +1100 Subject: md: 'array_size' sysfs attribute Allow userspace to set the size of the array according to the following semantics: 1/ size must be <= to the size returned by mddev->pers->size(mddev, 0, 0) a) If size is set before the array is running, do_md_run will fail if size is greater than the default size b) A reshape attempt that reduces the default size to less than the set array size should be blocked 2/ once userspace sets the size the kernel will not change it 3/ writing 'default' to this attribute returns control of the size to the kernel and reverts to the size reported by the personality Also, convert locations that need to know the default size from directly reading ->array_sectors to _size. Resync/reshape operations always follow the default size. Finally, fixup other locations that read a number of 1k-blocks from userspace to use strict_blocks_to_sectors() which checks for unsigned long long to sector_t overflow and blocks to sectors overflow. 
Reviewed-by: Andre Noll Signed-off-by: Dan Williams diff --git a/drivers/md/md.c b/drivers/md/md.c index 76ba69b..923d125 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -387,6 +387,11 @@ static inline int mddev_lock(mddev_t * mddev) return mutex_lock_interruptible(&mddev->reconfig_mutex); } +static inline int mddev_is_locked(mddev_t *mddev) +{ + return mutex_is_locked(&mddev->reconfig_mutex); +} + static inline int mddev_trylock(mddev_t * mddev) { return mutex_trylock(&mddev->reconfig_mutex); @@ -2282,16 +2287,34 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) return 1; } +static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) +{ + unsigned long long blocks; + sector_t new; + + if (strict_strtoull(buf, 10, &blocks) < 0) + return -EINVAL; + + if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) + return -EINVAL; /* sector conversion overflow */ + + new = blocks * 2; + if (new != blocks * 2) + return -EINVAL; /* unsigned long long to sector_t overflow */ + + *sectors = new; + return 0; +} + static ssize_t rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) { mddev_t *my_mddev = rdev->mddev; sector_t oldsectors = rdev->sectors; - unsigned long long sectors; + sector_t sectors; - if (strict_strtoull(buf, 10, &sectors) < 0) + if (strict_blocks_to_sectors(buf, &sectors) < 0) return -EINVAL; - sectors *= 2; if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { sectors = super_types[my_mddev->major_version]. @@ -3182,12 +3205,11 @@ size_store(mddev_t *mddev, const char *buf, size_t len) * not increase it (except from 0).
* If array is active, we can try an on-line resize */ - unsigned long long sectors; - int err = strict_strtoull(buf, 10, &sectors); + sector_t sectors; + int err = strict_blocks_to_sectors(buf, &sectors); if (err < 0) return err; - sectors *= 2; if (mddev->pers) { err = update_size(mddev, sectors); md_update_sb(mddev, 1); @@ -3627,6 +3649,57 @@ static struct md_sysfs_entry md_reshape_position = __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, reshape_position_store); +static ssize_t +array_size_show(mddev_t *mddev, char *page) +{ + if (mddev->external_size) + return sprintf(page, "%llu\n", + (unsigned long long)mddev->array_sectors/2); + else + return sprintf(page, "default\n"); +} + +static ssize_t +array_size_store(mddev_t *mddev, const char *buf, size_t len) +{ + sector_t sectors; + + if (strncmp(buf, "default", 7) == 0) { + if (mddev->pers) + sectors = mddev->pers->size(mddev, 0, 0); + else + sectors = mddev->array_sectors; + + mddev->external_size = 0; + } else { + if (strict_blocks_to_sectors(buf, &sectors) < 0) + return -EINVAL; + if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) + return -EINVAL; + + mddev->external_size = 1; + } + + mddev->array_sectors = sectors; + set_capacity(mddev->gendisk, mddev->array_sectors); + if (mddev->pers) { + struct block_device *bdev = bdget_disk(mddev->gendisk, 0); + + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, + (loff_t)mddev->array_sectors << 9); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + } + + return len; +} + +static struct md_sysfs_entry md_array_size = +__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, + array_size_store); static struct attribute *md_default_attrs[] = { &md_level.attr, @@ -3640,6 +3713,7 @@ static struct attribute *md_default_attrs[] = { &md_safe_delay.attr, &md_array_state.attr, &md_reshape_position.attr, + &md_array_size.attr, NULL, }; @@ -4045,7 +4119,17 @@ static int do_md_run(mddev_t * mddev) err =
mddev->pers->run(mddev); if (err) printk(KERN_ERR "md: pers->run() failed ...\n"); - else if (mddev->pers->sync_request) { + else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { + WARN_ONCE(!mddev->external_size, "%s: default size too small," + " but 'external_size' not in effect?\n", __func__); + printk(KERN_ERR + "md: invalid array_size %llu > default size %llu\n", + (unsigned long long)mddev->array_sectors / 2, + (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); + err = -EINVAL; + mddev->pers->stop(mddev); + } + if (err == 0 && mddev->pers->sync_request) { err = bitmap_create(mddev); if (err) { printk(KERN_ERR "%s: failed to create bitmap (%d)\n", @@ -4281,6 +4365,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) export_array(mddev); mddev->array_sectors = 0; + mddev->external_size = 0; mddev->dev_sectors = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; @@ -4979,10 +5064,23 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) { + WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + + if (mddev->external_size) + return; + mddev->array_sectors = array_sectors; } EXPORT_SYMBOL(md_set_array_sectors); +void md_set_array_sectors_lock(mddev_t *mddev, sector_t array_sectors) +{ + mddev_lock(mddev); + md_set_array_sectors(mddev, array_sectors); + mddev_unlock(mddev); +} +EXPORT_SYMBOL(md_set_array_sectors_lock); + static int update_size(mddev_t *mddev, sector_t num_sectors) { mdk_rdev_t *rdev; diff --git a/drivers/md/md.h b/drivers/md/md.h index ce89dda..d13e34f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -147,6 +147,8 @@ struct mddev_s sector_t dev_sectors; /* used size of * component devices */ sector_t array_sectors; /* exported array size */ + int external_size; /* size managed + * externally */ __u64 events; char uuid[16]; @@ -431,3 +433,4 @@ extern void md_new_event(mddev_t *mddev); extern int md_allow_write(mddev_t 
*mddev); extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); +extern void md_set_array_sectors_lock(mddev_t *mddev, sector_t array_sectors); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6f7e538..c08d755 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -313,7 +313,7 @@ static int raid0_run (mddev_t *mddev) printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", (unsigned long long)conf->spacing); { - sector_t s = mddev->array_sectors; + sector_t s = raid0_size(mddev, 0, 0); sector_t space = conf->spacing; int round; conf->sector_shift = 0; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 044116b..b4f4bad 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2125,14 +2125,16 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) * worth it. */ md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); + if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) + return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - if (mddev->array_sectors > mddev->dev_sectors && + if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->dev_sectors = mddev->array_sectors; + mddev->dev_sectors = sectors; mddev->resync_max_sectors = sectors; return 0; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ad153b2..e293d92 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2194,7 +2194,7 @@ static int run(mddev_t *mddev) * Ok, everything is just fine now */ md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); - mddev->resync_max_sectors = mddev->array_sectors; + mddev->resync_max_sectors = raid10_size(mddev, 0, 0); mddev->queue->unplug_fn = raid10_unplug; mddev->queue->backing_dev_info.congested_fn = raid10_congested; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c 
index 2930fc2..1aebd3e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3703,6 +3703,8 @@ static int make_request(struct request_queue *q, struct bio * bi) return 0; } +static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks); + static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) { /* reshaping is quite different to recovery/resync so it is @@ -3781,7 +3783,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped j == sh->qd_idx) continue; s = compute_blocknr(sh, j); - if (s < mddev->array_sectors) { + if (s < raid5_size(mddev, 0, 0)) { skipped = 1; continue; } @@ -4700,6 +4702,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) sectors &= ~((sector_t)mddev->chunk_size/512 - 1); md_set_array_sectors(mddev, raid5_size(mddev, sectors, mddev->raid_disks)); + if (mddev->array_sectors > + raid5_size(mddev, sectors, mddev->raid_disks)) + return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { @@ -4837,7 +4842,7 @@ static void end_reshape(raid5_conf_t *conf) if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { mddev_t *mddev = conf->mddev; - md_set_array_sectors(mddev, raid5_size(mddev, 0, + md_set_array_sectors_lock(mddev, raid5_size(mddev, 0, conf->raid_disks)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; -- cgit v0.10.2 From 18b0033491f584a2d79697da714b1ef9d6b27d22 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Tue, 31 Mar 2009 15:00:56 +1100 Subject: md: raid5 run(): Fix max_degraded for raid level 4. raid4 allows only one failed disk. 
Signed-off-by: Andre Noll Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1aebd3e..e1ee181 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4331,7 +4331,7 @@ static int run(mddev_t *mddev) */ sector_t here_new, here_old; int old_disks; - int max_degraded = (mddev->level == 5 ? 1 : 2); + int max_degraded = (mddev->level == 6 ? 2 : 1); if (mddev->new_level != mddev->level || mddev->new_layout != mddev->layout || -- cgit v0.10.2 From f701d589aa34d7531183c9ac6f7713ba14212b02 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 31 Mar 2009 15:09:39 +1100 Subject: md/raid6: move raid6 data processing to raid6_pq.ko Move the raid6 data processing routines into a standalone module (raid6_pq) to prepare them to be called from async_tx wrappers and other non-md drivers/modules. This precludes a circular dependency of raid456 needing the async modules for data processing while those modules in turn depend on raid456 for the base level synchronous raid6 routines. To support this move: 1/ The exportable definitions in raid6.h move to include/linux/raid/pq.h 2/ The raid6_call, recovery calls, and table symbols are exported 3/ Extra #ifdef __KERNEL__ statements to enable the userspace raid6test to compile Signed-off-by: Dan Williams Signed-off-by: NeilBrown diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2281b50..449d0b9 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -121,6 +121,7 @@ config MD_RAID10 config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD + select MD_RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR ---help--- @@ -180,6 +181,9 @@ config MD_RAID5_RESHAPE If unsure, say Y. 
+config MD_RAID6_PQ + tristate + config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 3b118da..45cc595 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -9,7 +9,8 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o dm-mirror-y += dm-raid1.o md-mod-y += md.o bitmap.o -raid456-y += raid5.o raid6algos.o raid6recov.o raid6tables.o \ +raid456-y += raid5.o +raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ raid6int8.o raid6int16.o raid6int32.o \ raid6altivec1.o raid6altivec2.o raid6altivec4.o \ @@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o +obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c index b61d576..3b15008 100644 --- a/drivers/md/mktables.c +++ b/drivers/md/mktables.c @@ -59,7 +59,7 @@ int main(int argc, char *argv[]) uint8_t v; uint8_t exptbl[256], invtbl[256]; - printf("#include \"raid6.h\"\n"); + printf("#include \n"); /* Compute multiplication table */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -76,6 +76,9 @@ int main(int argc, char *argv[]) printf("\t},\n"); } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfmul);\n"); + printf("#endif\n"); /* Compute power-of-2 table (exponent) */ v = 1; @@ -92,6 +95,9 @@ int main(int argc, char *argv[]) } } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexp);\n"); + printf("#endif\n"); /* Compute inverse table x^-1 == x^254 */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -104,6 +110,9 @@ int main(int argc, char *argv[]) } } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + 
printf("EXPORT_SYMBOL(raid6_gfinv);\n"); + printf("#endif\n"); /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -115,6 +124,9 @@ int main(int argc, char *argv[]) (j == 7) ? '\n' : ' '); } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexi);\n"); + printf("#endif\n"); return 0; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e1ee181..1f1b054 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -45,11 +45,11 @@ #include #include +#include #include #include #include "md.h" #include "raid5.h" -#include "raid6.h" #include "bitmap.h" /* @@ -94,11 +94,6 @@ #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) -#if !RAID6_USE_EMPTY_ZERO_PAGE -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -#endif - /* * We maintain a biased count of active stripes in the bottom 16 bits of * bi_phys_segments, and a count of processed stripes in the upper 16 bits @@ -5153,11 +5148,6 @@ static struct mdk_personality raid4_personality = static int __init raid5_init(void) { - int e; - - e = raid6_select_algo(); - if ( e ) - return e; register_md_personality(&raid6_personality); register_md_personality(&raid5_personality); register_md_personality(&raid4_personality); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index c172371..2934ee0 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -269,6 +269,8 @@ struct r6_state { #define READ_MODIFY_WRITE 2 /* not a write method, but a compute_parity mode */ #define CHECK_PARITY 3 +/* Additional compute_parity mode -- updates the parity w/o LOCKING */ +#define UPDATE_PARITY 4 /* * Stripe state diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h deleted file mode 100644 index 8a9c823..0000000 --- a/drivers/md/raid6.h +++ /dev/null @@ -1,126 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2003 H. 
Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -#ifndef LINUX_RAID_RAID6_H -#define LINUX_RAID_RAID6_H - -#ifdef __KERNEL__ - -/* Set to 1 to use kernel-wide empty_zero_page */ -#define RAID6_USE_EMPTY_ZERO_PAGE 0 -#include - -/* Additional compute_parity mode -- updates the parity w/o LOCKING */ -#define UPDATE_PARITY 4 - -/* We need a pre-zeroed page... if we don't want to use the kernel-provided - one define it here */ -#if RAID6_USE_EMPTY_ZERO_PAGE -# define raid6_empty_zero_page empty_zero_page -#else -extern const char raid6_empty_zero_page[PAGE_SIZE]; -#endif - -#else /* ! 
__KERNEL__ */ -/* Used for testing in user space */ - -#include -#include -#include -#include -#include -#include - -/* Not standard, but glibc defines it */ -#define BITS_PER_LONG __WORDSIZE - -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; - -#ifndef PAGE_SIZE -# define PAGE_SIZE 4096 -#endif -extern const char raid6_empty_zero_page[PAGE_SIZE]; - -#define __init -#define __exit -#define __attribute_const__ __attribute__((const)) -#define noinline __attribute__((noinline)) - -#define preempt_enable() -#define preempt_disable() -#define cpu_has_feature(x) 1 -#define enable_kernel_altivec() -#define disable_kernel_altivec() - -#endif /* __KERNEL__ */ - -/* Routine choices */ -struct raid6_calls { - void (*gen_syndrome)(int, size_t, void **); - int (*valid)(void); /* Returns 1 if this routine set is usable */ - const char *name; /* Name of this routine set */ - int prefer; /* Has special performance attribute */ -}; - -/* Selected algorithm */ -extern struct raid6_calls raid6_call; - -/* Algorithm list */ -extern const struct raid6_calls * const raid6_algos[]; -int raid6_select_algo(void); - -/* Return values from chk_syndrome */ -#define RAID6_OK 0 -#define RAID6_P_BAD 1 -#define RAID6_Q_BAD 2 -#define RAID6_PQ_BAD 3 - -/* Galois field tables */ -extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); -extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); -extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); -extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); - -/* Recovery routines */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); -void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); - -/* Some definitions to allow code to be compiled for testing in userspace */ -#ifndef __KERNEL__ - -# define jiffies raid6_jiffies() -# define printk 
printf -# define GFP_KERNEL 0 -# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0)) -# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE) - -static inline void cpu_relax(void) -{ - /* Nothing */ -} - -#undef HZ -#define HZ 1000 -static inline uint32_t raid6_jiffies(void) -{ - struct timeval tv; - gettimeofday(&tv, NULL); - return tv.tv_sec*1000 + tv.tv_usec/1000; -} - -#endif /* ! __KERNEL__ */ - -#endif /* LINUX_RAID_RAID6_H */ diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c index 1f6a3c8..866215a 100644 --- a/drivers/md/raid6algos.c +++ b/drivers/md/raid6algos.c @@ -16,13 +16,20 @@ * Algorithm list and algorithm selection for RAID-6 */ -#include "raid6.h" +#include #ifndef __KERNEL__ #include #include +#else +#if !RAID6_USE_EMPTY_ZERO_PAGE +/* In .bss so it's zeroed */ +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +EXPORT_SYMBOL(raid6_empty_zero_page); +#endif #endif struct raid6_calls raid6_call; +EXPORT_SYMBOL_GPL(raid6_call); /* Various routine sets */ extern const struct raid6_calls raid6_intx1; @@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = { #else /* Need more time to be stable in userspace */ #define RAID6_TIME_JIFFIES_LG2 9 +#define time_before(x, y) ((x) < (y)) #endif /* Try to pick the best algorithm */ @@ -152,3 +160,12 @@ int __init raid6_select_algo(void) return best ? 
0 : -EINVAL; } + +static void raid6_exit(void) +{ + do { } while (0); +} + +subsys_initcall(raid6_select_algo); +module_exit(raid6_exit); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc index 2175806..699dfee 100644 --- a/drivers/md/raid6altivec.uc +++ b/drivers/md/raid6altivec.uc @@ -22,7 +22,7 @@ * bracked this with preempt_disable/enable or in a lock) */ -#include "raid6.h" +#include #ifdef CONFIG_ALTIVEC diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc index 32a0bac..f9bf9cb 100644 --- a/drivers/md/raid6int.uc +++ b/drivers/md/raid6int.uc @@ -18,7 +18,7 @@ * This file is postprocessed using unroll.pl */ -#include "raid6.h" +#include /* * This is the C data type to use diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c index 804cb50..e7f6c13 100644 --- a/drivers/md/raid6mmx.c +++ b/drivers/md/raid6mmx.c @@ -18,7 +18,7 @@ #if defined(__i386__) && !defined(__arch_um__) -#include "raid6.h" +#include #include "raid6x86.h" /* Shared with raid6sse1.c */ diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c index 7a98b86..2609f00 100644 --- a/drivers/md/raid6recov.c +++ b/drivers/md/raid6recov.c @@ -18,7 +18,7 @@ * the syndrome.) */ -#include "raid6.h" +#include /* Recover two failed data blocks. */ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, @@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, p++; q++; } } - - - +EXPORT_SYMBOL_GPL(raid6_2data_recov); /* Recover failure of one data block plus the P block */ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) @@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) q++; dq++; } } +EXPORT_SYMBOL_GPL(raid6_datap_recov); - -#ifndef __KERNEL__ /* Testing only */ +#ifndef __KERNEL__ +/* Testing only */ /* Recover two failed blocks. 
*/ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c index 15c5889..b274dd5 100644 --- a/drivers/md/raid6sse1.c +++ b/drivers/md/raid6sse1.c @@ -23,7 +23,7 @@ #if defined(__i386__) && !defined(__arch_um__) -#include "raid6.h" +#include #include "raid6x86.h" /* Defined in raid6mmx.c */ diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c index 2e92e96..6ed6c6c 100644 --- a/drivers/md/raid6sse2.c +++ b/drivers/md/raid6sse2.c @@ -19,7 +19,7 @@ #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) -#include "raid6.h" +#include #include "raid6x86.h" static const struct raid6_sse_constants { diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile index 78e0396..58ffdf4 100644 --- a/drivers/md/raid6test/Makefile +++ b/drivers/md/raid6test/Makefile @@ -5,7 +5,7 @@ CC = gcc OPTFLAGS = -O2 # Adjust as desired -CFLAGS = -I.. -g $(OPTFLAGS) +CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) LD = ld PERL = perl AR = ar diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c index 559cc41..7a93031 100644 --- a/drivers/md/raid6test/test.c +++ b/drivers/md/raid6test/test.c @@ -17,7 +17,7 @@ #include #include #include -#include "raid6.h" +#include #define NDISKS 16 /* Including P and Q */ diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h new file mode 100644 index 0000000..d92480f --- /dev/null +++ b/include/linux/raid/pq.h @@ -0,0 +1,132 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2003 H. 
Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +#ifndef LINUX_RAID_RAID6_H +#define LINUX_RAID_RAID6_H + +#ifdef __KERNEL__ + +/* Set to 1 to use kernel-wide empty_zero_page */ +#define RAID6_USE_EMPTY_ZERO_PAGE 0 +#include + +/* We need a pre-zeroed page... if we don't want to use the kernel-provided + one define it here */ +#if RAID6_USE_EMPTY_ZERO_PAGE +# define raid6_empty_zero_page empty_zero_page +#else +extern const char raid6_empty_zero_page[PAGE_SIZE]; +#endif + +#else /* ! __KERNEL__ */ +/* Used for testing in user space */ + +#include +#include +#include +#include +#include +#include + +/* Not standard, but glibc defines it */ +#define BITS_PER_LONG __WORDSIZE + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +#ifndef PAGE_SIZE +# define PAGE_SIZE 4096 +#endif +extern const char raid6_empty_zero_page[PAGE_SIZE]; + +#define __init +#define __exit +#define __attribute_const__ __attribute__((const)) +#define noinline __attribute__((noinline)) + +#define preempt_enable() +#define preempt_disable() +#define cpu_has_feature(x) 1 +#define enable_kernel_altivec() +#define disable_kernel_altivec() + +#define EXPORT_SYMBOL(sym) +#define MODULE_LICENSE(licence) +#define subsys_initcall(x) +#define module_exit(x) +#endif /* __KERNEL__ */ + +/* Routine choices */ +struct raid6_calls { + void (*gen_syndrome)(int, size_t, void **); + int (*valid)(void); /* Returns 1 if this routine set is usable */ + const char *name; /* Name of this routine set */ + int prefer; /* Has special performance attribute */ +}; 
+ +/* Selected algorithm */ +extern struct raid6_calls raid6_call; + +/* Algorithm list */ +extern const struct raid6_calls * const raid6_algos[]; +int raid6_select_algo(void); + +/* Return values from chk_syndrome */ +#define RAID6_OK 0 +#define RAID6_P_BAD 1 +#define RAID6_Q_BAD 2 +#define RAID6_PQ_BAD 3 + +/* Galois field tables */ +extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); +extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); +extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); +extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); + +/* Recovery routines */ +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + void **ptrs); +void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); +void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, + void **ptrs); + +/* Some definitions to allow code to be compiled for testing in userspace */ +#ifndef __KERNEL__ + +# define jiffies raid6_jiffies() +# define printk printf +# define GFP_KERNEL 0 +# define __get_free_pages(x, y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), \ + PROT_READ|PROT_WRITE, \ + MAP_PRIVATE|MAP_ANONYMOUS,\ + 0, 0)) +# define free_pages(x, y) munmap((void *)(x), (y)*PAGE_SIZE) + +static inline void cpu_relax(void) +{ + /* Nothing */ +} + +#undef HZ +#define HZ 1000 +static inline uint32_t raid6_jiffies(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec*1000 + tv.tv_usec/1000; +} + +#endif /* ! __KERNEL__ */ + +#endif /* LINUX_RAID_RAID6_H */ -- cgit v0.10.2 From 34e04e87fb8b2c62c9e8868f41c8179d0e15f51a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:10:16 +1100 Subject: md/raid5: drop qd_idx from r6_state We now have this value in stripe_head so we don't need to duplicate it. 
Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1f1b054..3930b3e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2371,7 +2371,7 @@ static void handle_stripe_dirtying6(raid5_conf_t *conf, struct r6_state *r6s, int disks) { int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; - int qd_idx = r6s->qd_idx; + int qd_idx = sh->qd_idx; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Would I have to read this buffer for reconstruct_write */ @@ -2561,7 +2561,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, int update_p = 0, update_q = 0; struct r5dev *dev; int pd_idx = sh->pd_idx; - int qd_idx = r6s->qd_idx; + int qd_idx = sh->qd_idx; set_bit(STRIPE_HANDLE, &sh->state); @@ -2657,7 +2657,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, struct dma_async_tx_descriptor *tx = NULL; clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i = 0; i < sh->disks; i++) - if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { + if (i != sh->pd_idx && i != sh->qd_idx) { int dd_idx, j; struct stripe_head *sh2; @@ -2984,17 +2984,16 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; struct bio *return_bi = NULL; - int i, pd_idx = sh->pd_idx; + int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; struct stripe_head_state s; struct r6_state r6s; struct r5dev *dev, *pdev, *qdev; mdk_rdev_t *blocked_rdev = NULL; - r6s.qd_idx = sh->qd_idx; pr_debug("handling stripe %llu, state=%#lx cnt=%d, " "pd_idx=%d, qd_idx=%d\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), pd_idx, r6s.qd_idx); + atomic_read(&sh->count), pd_idx, qd_idx); memset(&s, 0, sizeof(s)); spin_lock(&sh->lock); @@ -3105,9 +3104,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) pdev = &sh->dev[pd_idx]; r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) || (s.failed >= 
2 && r6s.failed_num[1] == pd_idx); - qdev = &sh->dev[r6s.qd_idx]; - r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx) - || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx); + qdev = &sh->dev[qd_idx]; + r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) + || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); if ( s.written && ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 2934ee0..c2f37f2 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -239,7 +239,7 @@ struct stripe_head_state { /* r6_state - extra state data only relevant to r6 */ struct r6_state { - int p_failed, q_failed, qd_idx, failed_num[2]; + int p_failed, q_failed, failed_num[2]; }; /* Flags */ -- cgit v0.10.2 From 7ec0547838976d088dfb9cb0adb073e6e8a15aa3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:10:36 +1100 Subject: md/raid5: enhance raid5_size to work correctly with negative delta_disks This is the first of four patches which combine to allow md/raid5 to reduce the number of devices in the array by restriping the data over a subset of the devices. If the number of disks in a raid4/5/6 is being reduced, then the default size must be based on the new number, not the old number of devices. In general, it should be based on the smaller of new and old. 
Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3930b3e..5694eb8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4169,8 +4169,13 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) if (!sectors) sectors = mddev->dev_sectors; - if (!raid_disks) - raid_disks = conf->previous_raid_disks; + if (!raid_disks) { + /* size is defined by the smallest of previous and new size */ + if (conf->raid_disks < conf->previous_raid_disks) + raid_disks = conf->raid_disks; + else + raid_disks = conf->previous_raid_disks; + } sectors &= ~((sector_t)mddev->chunk_size/512 - 1); return sectors * (raid_disks - conf->max_degraded); -- cgit v0.10.2 From cea9c22800773cecb1d41f4a6139f9eb6a95368b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:15:05 +1100 Subject: md: add explicit method to signal the end of a reshape. Currently raid5 (the only module that supports restriping) notices that the reshape has finished by sync_request being given a large value, and handles any cleanup then. This patch changes it so md_check_recovery calls into an explicit finish_reshape method as well. The clean-up from sync_request can do things that need to be done promptly, typically things local to the raid5_conf_t structure. The "finish_reshape" method is called under the mddev_lock so it can do things involving reconfiguring the device. This allows us to get rid of md_set_array_sectors_lock, which would have caused a deadlock if you tried to stop an array while a reshape was happening.
Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 923d125..c509313 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5073,14 +5073,6 @@ void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) } EXPORT_SYMBOL(md_set_array_sectors); -void md_set_array_sectors_lock(mddev_t *mddev, sector_t array_sectors) -{ - mddev_lock(mddev); - md_set_array_sectors(mddev, array_sectors); - mddev_unlock(mddev); -} -EXPORT_SYMBOL(md_set_array_sectors_lock); - static int update_size(mddev_t *mddev, sector_t num_sectors) { mdk_rdev_t *rdev; @@ -6641,6 +6633,9 @@ void md_check_recovery(mddev_t *mddev) sysfs_notify(&mddev->kobj, NULL, "degraded"); } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); md_update_sb(mddev, 1); /* if array is no-longer degraded, then any saved_raid_disk diff --git a/drivers/md/md.h b/drivers/md/md.h index d13e34f..e9b7f54 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -317,6 +317,7 @@ struct mdk_personality sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks); int (*check_reshape) (mddev_t *mddev); int (*start_reshape) (mddev_t *mddev); + void (*finish_reshape) (mddev_t *mddev); int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); /* quiesce moves between quiescence states * 0 - fully active @@ -433,4 +434,3 @@ extern void md_new_event(mddev_t *mddev); extern int md_allow_write(mddev_t *mddev); extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); -extern void md_set_array_sectors_lock(mddev_t *mddev, sector_t array_sectors); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5694eb8..a0f22dd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3850,6 +3850,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski if (sector_nr >= max_sector) { /* just being told to finish up .. 
nothing much to do */ unplug_slaves(mddev); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { end_reshape(conf); return 0; @@ -4836,43 +4837,49 @@ static int raid5_start_reshape(mddev_t *mddev) static void end_reshape(raid5_conf_t *conf) { - struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - mddev_t *mddev = conf->mddev; - - md_set_array_sectors_lock(mddev, raid5_size(mddev, 0, - conf->raid_disks)); - set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; - conf->previous_raid_disks = conf->raid_disks; - bdev = bdget_disk(conf->mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)conf->mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } spin_lock_irq(&conf->device_lock); + conf->previous_raid_disks = conf->raid_disks; conf->expand_progress = MaxSector; spin_unlock_irq(&conf->device_lock); - conf->mddev->reshape_position = MaxSector; /* read-ahead size must cover two whole stripes, which is * 2 * (datadisks) * chunksize where 'n' is the number of raid devices */ { - int data_disks = conf->previous_raid_disks - conf->max_degraded; - int stripe = data_disks * - (conf->mddev->chunk_size / PAGE_SIZE); + int data_disks = conf->raid_disks - conf->max_degraded; + int stripe = data_disks * (conf->chunk_size + / PAGE_SIZE); if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; } } } +static void raid5_finish_reshape(mddev_t *mddev) +{ + struct block_device *bdev; + + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + set_capacity(mddev->gendisk, mddev->array_sectors); + mddev->changed = 1; + mddev->reshape_position = MaxSector; + + bdev = bdget_disk(mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, + (loff_t)mddev->array_sectors << 
9); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + } +} + static void raid5_quiesce(mddev_t *mddev, int state) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -5098,6 +5105,7 @@ static struct mdk_personality raid6_personality = #ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, + .finish_reshape = raid5_finish_reshape, #endif .quiesce = raid5_quiesce, .takeover = raid6_takeover, @@ -5121,6 +5129,7 @@ static struct mdk_personality raid5_personality = #ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, + .finish_reshape = raid5_finish_reshape, #endif .quiesce = raid5_quiesce, .takeover = raid5_takeover, @@ -5146,6 +5155,7 @@ static struct mdk_personality raid4_personality = #ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, + .finish_reshape = raid5_finish_reshape, #endif .quiesce = raid5_quiesce, }; -- cgit v0.10.2 From fef9c61fdfabf97a307c2cf3621a6949f0a4b995 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:16:46 +1100 Subject: md/raid5: change reshape-progress measurement to cope with reshaping backwards. When reducing the number of devices in a raid4/5/6, the reshape process has to start at the end of the array and work down to the beginning. So we need to handle expand_progress and expand_lo differently. This patch renames "expand_progress" and "expand_lo" to avoid the implication that anything is getting bigger (expand->reshape) and every place they are used, we make sure that they are used the right way depending on whether delta_disks is positive or negative. 
Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a0f22dd..1023c4e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3593,24 +3593,28 @@ static int make_request(struct request_queue *q, struct bio * bi) retry: previous = 0; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - if (likely(conf->expand_progress == MaxSector)) + if (likely(conf->reshape_progress == MaxSector)) disks = conf->raid_disks; else { - /* spinlock is needed as expand_progress may be + /* spinlock is needed as reshape_progress may be * 64bit on a 32bit platform, and so it might be * possible to see a half-updated value - * Ofcourse expand_progress could change after + * Ofcourse reshape_progress could change after * the lock is dropped, so once we get a reference * to the stripe that we think it is, we will have * to check again. */ spin_lock_irq(&conf->device_lock); disks = conf->raid_disks; - if (logical_sector >= conf->expand_progress) { + if (mddev->delta_disks < 0 + ? logical_sector < conf->reshape_progress + : logical_sector >= conf->reshape_progress) { disks = conf->previous_raid_disks; previous = 1; } else { - if (logical_sector >= conf->expand_lo) { + if (mddev->delta_disks < 0 + ? logical_sector < conf->reshape_safe + : logical_sector >= conf->reshape_safe) { spin_unlock_irq(&conf->device_lock); schedule(); goto retry; @@ -3630,7 +3634,7 @@ static int make_request(struct request_queue *q, struct bio * bi) sh = get_active_stripe(conf, new_sector, previous, (bi->bi_rw&RWA_MASK)); if (sh) { - if (unlikely(conf->expand_progress != MaxSector)) { + if (unlikely(conf->reshape_progress != MaxSector)) { /* expansion might have moved on while waiting for a * stripe, so we must do the range check again. 
* Expansion could still move past after this @@ -3641,8 +3645,10 @@ static int make_request(struct request_queue *q, struct bio * bi) */ int must_retry = 0; spin_lock_irq(&conf->device_lock); - if (logical_sector < conf->expand_progress && - disks == conf->previous_raid_disks) + if ((mddev->delta_disks < 0 + ? logical_sector >= conf->reshape_progress + : logical_sector < conf->reshape_progress) + && disks == conf->previous_raid_disks) /* mismatch, need to try again */ must_retry = 1; spin_unlock_irq(&conf->device_lock); @@ -3720,13 +3726,20 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped int dd_idx; sector_t writepos, safepos, gap; - if (sector_nr == 0 && - conf->expand_progress != 0) { - /* restarting in the middle, skip the initial sectors */ - sector_nr = conf->expand_progress; + if (sector_nr == 0) { + /* If restarting in the middle, skip the initial sectors */ + if (mddev->delta_disks < 0 && + conf->reshape_progress < raid5_size(mddev, 0, 0)) { + sector_nr = raid5_size(mddev, 0, 0) + - conf->reshape_progress; + } else if (mddev->delta_disks > 0 && + conf->reshape_progress > 0) + sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); - *skipped = 1; - return sector_nr; + if (sector_nr) { + *skipped = 1; + return sector_nr; + } } /* we update the metadata when there is more than 3Meg @@ -3734,28 +3747,37 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * probably be time based) or when the data about to be * copied would over-write the source of the data at * the front of the range. - * i.e. one new_stripe forward from expand_progress new_maps - * to after where expand_lo old_maps to + * i.e. 
one new_stripe along from reshape_progress new_maps + * to after where reshape_safe old_maps to */ - writepos = conf->expand_progress + - conf->chunk_size/512*(new_data_disks); + writepos = conf->reshape_progress; sector_div(writepos, new_data_disks); - safepos = conf->expand_lo; + safepos = conf->reshape_safe; sector_div(safepos, data_disks); - gap = conf->expand_progress - conf->expand_lo; + if (mddev->delta_disks < 0) { + writepos -= conf->chunk_size/512; + safepos += conf->chunk_size/512; + gap = conf->reshape_safe - conf->reshape_progress; + } else { + writepos += conf->chunk_size/512; + safepos -= conf->chunk_size/512; + gap = conf->reshape_progress - conf->reshape_safe; + } - if (writepos >= safepos || + if ((mddev->delta_disks < 0 + ? writepos < safepos + : writepos > safepos) || gap > (new_data_disks)*3000*2 /*3Meg*/) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); - mddev->reshape_position = conf->expand_progress; + mddev->reshape_position = conf->reshape_progress; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || kthread_should_stop()); spin_lock_irq(&conf->device_lock); - conf->expand_lo = mddev->reshape_position; + conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); } @@ -3792,7 +3814,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped release_stripe(sh); } spin_lock_irq(&conf->device_lock); - conf->expand_progress = (sector_nr + i) * new_data_disks; + if (mddev->delta_disks < 0) + conf->reshape_progress -= i * new_data_disks; + else + conf->reshape_progress += i * new_data_disks; spin_unlock_irq(&conf->device_lock); /* Ok, those stripe are ready. We can start scheduling * reads on the source stripes. 
@@ -3823,14 +3848,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); - mddev->reshape_position = conf->expand_progress; + mddev->reshape_position = conf->reshape_progress; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_DEVS, &mddev->flags) || kthread_should_stop()); spin_lock_irq(&conf->device_lock); - conf->expand_lo = mddev->reshape_position; + conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); } @@ -4283,7 +4308,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->max_degraded = 1; conf->algorithm = mddev->new_layout; conf->max_nr_stripes = NR_STRIPES; - conf->expand_progress = mddev->reshape_position; + conf->reshape_progress = mddev->reshape_position; memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; @@ -4441,9 +4466,9 @@ static int run(mddev_t *mddev) print_raid5_conf(conf); - if (conf->expand_progress != MaxSector) { + if (conf->reshape_progress != MaxSector) { printk("...ok start reshape thread\n"); - conf->expand_lo = conf->expand_progress; + conf->reshape_safe = conf->reshape_progress; atomic_set(&conf->reshape_stripes, 0); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -4782,8 +4807,11 @@ static int raid5_start_reshape(mddev_t *mddev) spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; conf->raid_disks += mddev->delta_disks; - conf->expand_progress = 0; - conf->expand_lo = 0; + if (mddev->delta_disks < 0) + conf->reshape_progress = raid5_size(mddev, 0, 0); + else + conf->reshape_progress = 0; + conf->reshape_safe = conf->reshape_progress; spin_unlock_irq(&conf->device_lock); /* Add some new 
drives, as many as will fit. @@ -4825,7 +4853,7 @@ static int raid5_start_reshape(mddev_t *mddev) mddev->recovery = 0; spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; - conf->expand_progress = MaxSector; + conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); return -EAGAIN; } @@ -4842,7 +4870,7 @@ static void end_reshape(raid5_conf_t *conf) spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; - conf->expand_progress = MaxSector; + conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); /* read-ahead size must cover two whole stripes, which is diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index c2f37f2..b2edcc4 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -337,11 +337,16 @@ struct raid5_private_data { int raid_disks; int max_nr_stripes; - /* used during an expand */ - sector_t expand_progress; /* MaxSector when no expand happening */ - sector_t expand_lo; /* from here up to expand_progress it out-of-bounds - * as we haven't flushed the metadata yet - */ + /* reshape_progress is the leading edge of a 'reshape' + * It has value MaxSector when no reshape is happening + * If delta_disks < 0, it is the last sector we started work on, + * else is it the next sector to work on. + */ + sector_t reshape_progress; + /* reshape_safe is the trailing edge of a reshape. We know that + * before (or after) this address, all reshape has completed. + */ + sector_t reshape_safe; int previous_raid_disks; struct list_head handle_list; /* stripes needing handling */ -- cgit v0.10.2 From ec32a2bd35bd6b933a5db6542c48210ce069a376 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:17:38 +1100 Subject: md: allow number of drives in raid5 to be reduced When reshaping a raid5 to have fewer devices, we work from the end of the array to the beginning. md_do_sync gives addresses to sync_request that go from the beginning to the end. 
So largely ignore them and use the internal state variable "reshape_progress" to keep track of what to do next. Never allow the size to be reduced below the minimum (4 for raid6, 2 otherwise). We require that the size of the array has already been reduced before the array is reshaped to a smaller size. This is because simply reducing the size is an easily reversible operation, while the reshape is immediately destructive and so is not reversible for the blocks at the ends of the devices. Thus to reshape an array to have fewer devices, you must first write an appropriately small size to md/array_size. When the reshape has finished, we remove any drives that are no longer needed and fix up ->degraded. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1023c4e..76eed59 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3725,6 +3725,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped int i; int dd_idx; sector_t writepos, safepos, gap; + sector_t stripe_addr; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -3782,10 +3783,21 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wake_up(&conf->wait_for_overlap); } + if (mddev->delta_disks < 0) { + BUG_ON(conf->reshape_progress == 0); + stripe_addr = writepos; + BUG_ON((mddev->dev_sectors & + ~((sector_t)mddev->chunk_size / 512 - 1)) + - (conf->chunk_size / 512) - stripe_addr + != sector_nr); + } else { + BUG_ON(writepos != sector_nr + conf->chunk_size / 512); + stripe_addr = sector_nr; + } for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { int j; int skipped = 0; - sh = get_active_stripe(conf, sector_nr+i, 0, 0); + sh = get_active_stripe(conf, stripe_addr+i, 0, 0); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -3825,10 +3837,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int 
*skipped * block on the destination stripes. */ first_sector = - raid5_compute_sector(conf, sector_nr*(new_data_disks), + raid5_compute_sector(conf, stripe_addr*(new_data_disks), 1, &dd_idx, NULL); last_sector = - raid5_compute_sector(conf, ((sector_nr+conf->chunk_size/512) + raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512) *(new_data_disks) - 1), 1, &dd_idx, NULL); if (last_sector >= mddev->dev_sectors) @@ -4366,12 +4378,6 @@ static int run(mddev_t *mddev) mdname(mddev)); return -EINVAL; } - if (mddev->delta_disks <= 0) { - printk(KERN_ERR "raid5: %s: unsupported reshape " - "(reduce disks) required - aborting.\n", - mdname(mddev)); - return -EINVAL; - } old_disks = mddev->raid_disks - mddev->delta_disks; /* reshape_position must be on a new-stripe boundary, and one * further up in new geometry must map after here in old @@ -4648,6 +4654,10 @@ static int raid5_remove_disk(mddev_t *mddev, int number) print_raid5_conf(conf); rdev = p->rdev; if (rdev) { + if (number >= conf->raid_disks && + conf->reshape_progress == MaxSector) + clear_bit(In_sync, &rdev->flags); + if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { err = -EBUSY; @@ -4657,7 +4667,8 @@ static int raid5_remove_disk(mddev_t *mddev, int number) * isn't possible. 
*/ if (!test_bit(Faulty, &rdev->flags) && - mddev->degraded <= conf->max_degraded) { + mddev->degraded <= conf->max_degraded && + number < conf->raid_disks) { err = -EBUSY; goto abort; } @@ -4745,16 +4756,26 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) static int raid5_check_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); - int err; - if (mddev->delta_disks < 0 || - mddev->new_level != mddev->level) - return -EINVAL; /* Cannot shrink array or change level yet */ if (mddev->delta_disks == 0) return 0; /* nothing to do */ if (mddev->bitmap) /* Cannot grow a bitmap yet */ return -EBUSY; + if (mddev->degraded > conf->max_degraded) + return -EINVAL; + if (mddev->delta_disks < 0) { + /* We might be able to shrink, but the devices must + * be made bigger first. + * For raid6, 4 is the minimum size. + * Otherwise 2 is the minimum + */ + int min = 2; + if (mddev->level == 6) + min = 4; + if (mddev->raid_disks + mddev->delta_disks < min) + return -EINVAL; + } /* Can only proceed if there are plenty of stripe_heads. * We need a minimum of one full stripe,, and for sensible progress @@ -4771,14 +4792,7 @@ static int raid5_check_reshape(mddev_t *mddev) return -ENOSPC; } - err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks); - if (err) - return err; - - if (mddev->degraded > conf->max_degraded) - return -EINVAL; - /* looks like we might be able to manage this */ - return 0; + return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); } static int raid5_start_reshape(mddev_t *mddev) @@ -4803,6 +4817,17 @@ static int raid5_start_reshape(mddev_t *mddev) */ return -EINVAL; + /* Refuse to reduce size of the array. Any reductions in + * array size must be through explicit setting of array_size + * attribute. 
+ */ + if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) + < mddev->array_sectors) { + printk(KERN_ERR "md: %s: array size must be reduced " + "before number of disks\n", mdname(mddev)); + return -EINVAL; + } + atomic_set(&conf->reshape_stripes, 0); spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; @@ -4836,9 +4861,12 @@ static int raid5_start_reshape(mddev_t *mddev) break; } - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; - spin_unlock_irqrestore(&conf->device_lock, flags); + if (mddev->delta_disks > 0) { + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) + - added_devices; + spin_unlock_irqrestore(&conf->device_lock, flags); + } mddev->raid_disks = conf->raid_disks; mddev->reshape_position = 0; set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -4863,6 +4891,9 @@ static int raid5_start_reshape(mddev_t *mddev) } #endif +/* This is called from the reshape thread and should make any + * changes needed in 'conf' + */ static void end_reshape(raid5_conf_t *conf) { @@ -4886,25 +4917,44 @@ static void end_reshape(raid5_conf_t *conf) } } +/* This is called from the raid5d thread with mddev_lock held. + * It makes config changes to the device. 
+ */ static void raid5_finish_reshape(mddev_t *mddev) { struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); - set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; - mddev->reshape_position = MaxSector; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); + if (mddev->delta_disks > 0) { + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + set_capacity(mddev->gendisk, mddev->array_sectors); + mddev->changed = 1; + + bdev = bdget_disk(mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, + (loff_t)mddev->array_sectors << 9); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + } else { + int d; + raid5_conf_t *conf = mddev_to_conf(mddev); + mddev->degraded = conf->raid_disks; + for (d = 0; d < conf->raid_disks ; d++) + if (conf->disks[d].rdev && + test_bit(In_sync, + &conf->disks[d].rdev->flags)) + mddev->degraded--; + for (d = conf->raid_disks ; + d < conf->raid_disks - mddev->delta_disks; + d++) + raid5_remove_disk(mddev, d); } + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; } } -- cgit v0.10.2 From 11373542344bdc35be1e6e68b0baadd1b6f7acbb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:18:37 +1100 Subject: Documentation/md.txt update Update md.txt to reflect recent changes in a number of sysfs attributes. Signed-off-by: NeilBrown diff --git a/Documentation/md.txt b/Documentation/md.txt index 1da9d1b..4edd39e 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -164,15 +164,19 @@ All md devices contain: raid_disks a text file with a simple number indicating the number of devices in a fully functional array. If this is not yet known, the file - will be empty. 
If an array is being resized (not currently - possible) this will contain the larger of the old and new sizes. - Some raid level (RAID1) allow this value to be set while the - array is active. This will reconfigure the array. Otherwise - it can only be set while assembling an array. + will be empty. If an array is being resized this will contain + the new number of devices. + Some raid levels allow this value to be set while the array is + active. This will reconfigure the array. Otherwise it can only + be set while assembling an array. + A change to this attribute will not be permitted if it would + reduce the size of the array. To reduce the number of drives + in an e.g. raid5, the array size must first be reduced by + setting the 'array_size' attribute. chunk_size - This is the size if bytes for 'chunks' and is only relevant to - raid levels that involve striping (1,4,5,6,10). The address space + This is the size in bytes for 'chunks' and is only relevant to + raid levels that involve striping (0,4,5,6,10). The address space of the array is conceptually divided into chunks and consecutive chunks are striped onto neighbouring devices. The size should be at least PAGE_SIZE (4k) and should be a power @@ -183,6 +187,20 @@ All md devices contain: simply a number that is interpretted differently by different levels. It can be written while assembling an array. + array_size + This can be used to artificially constrain the available space in + the array to be less than is actually available on the combined + devices. Writing a number (in Kilobytes) which is less than + the available size will set the size. Any reconfiguration of the + array (e.g. adding devices) will not cause the size to change. + Writing the word 'default' will cause the effective size of the + array to be whatever size is actually available based on + 'level', 'chunk_size' and 'component_size'. 
+ + This can be used to reduce the size of the array before reducing + the number of devices in a raid4/5/6, or to support external + metadata formats which mandate such clipping. + reshape_position This is either "none" or a sector number within the devices of the array where "reshape" is up to. If this is set, the three @@ -207,6 +225,11 @@ All md devices contain: about the array. It can be 0.90 (traditional format), 1.0, 1.1, 1.2 (newer format in varying locations) or "none" indicating that the kernel isn't managing metadata at all. + Alternately it can be "external:" followed by a string which + is set by user-space. This indicates that metadata is managed + by a user-space program. Any device failure or other event that + requires a metadata update will cause array activity to be + suspended until the event is acknowledged. resync_start The point at which resync should start. If no resync is needed, -- cgit v0.10.2 From 86b42c713be3e5f6807aa14b4cbdb005d35c64d5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:19:03 +1100 Subject: md/raid5: clearly differentiate 'before' and 'after' stripes during reshape. During a raid5 reshape, we have some stripes in the cache that are 'before' the reshape (and are still to be processed) and some that are 'after'. They are currently differentiated by having different ->disks values as the only reshape currently supported involves changing the number of disks. However we will soon support reshapes that do not change the number of disks (chunk parity or chunk size). So make the difference more explicit with a 'generation' number. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 76eed59..73cdf43 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -318,6 +318,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) remove_hash(sh); + sh->generation = conf->generation - previous; sh->disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; stripe_set_idx(sector, conf, previous, sh); @@ -341,7 +342,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) insert_hash(conf, sh); } -static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) +static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, + short generation) { struct stripe_head *sh; struct hlist_node *hn; @@ -349,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in CHECK_DEVLOCK(); pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) - if (sh->sector == sector && sh->disks == disks) + if (sh->sector == sector && sh->generation == generation) return sh; pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); return NULL; @@ -363,7 +365,6 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, int previous, int noblock) { struct stripe_head *sh; - int disks = previous ? conf->previous_raid_disks : conf->raid_disks; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); @@ -373,7 +374,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, conf->device_lock, /* nothing */); - sh = __find_stripe(conf, sector, disks); + sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { if (!conf->inactive_blocked) sh = get_free_stripe(conf); @@ -3648,7 +3649,7 @@ static int make_request(struct request_queue *q, struct bio * bi) if ((mddev->delta_disks < 0 ? 
logical_sector >= conf->reshape_progress : logical_sector < conf->reshape_progress) - && disks == conf->previous_raid_disks) + && previous) /* mismatch, need to try again */ must_retry = 1; spin_unlock_irq(&conf->device_lock); @@ -4837,6 +4838,7 @@ static int raid5_start_reshape(mddev_t *mddev) else conf->reshape_progress = 0; conf->reshape_safe = conf->reshape_progress; + conf->generation++; spin_unlock_irq(&conf->device_lock); /* Add some new drives, as many as will fit. diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index b2edcc4..a081fb4 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -198,6 +198,8 @@ struct stripe_head { struct hlist_node hash; struct list_head lru; /* inactive_list or handle_list */ struct raid5_private_data *raid_conf; + short generation; /* increments with every + * reshape */ sector_t sector; /* sector of this row */ short pd_idx; /* parity disk index */ short qd_idx; /* 'Q' disk index for raid6 */ @@ -348,6 +350,7 @@ struct raid5_private_data { */ sector_t reshape_safe; int previous_raid_disks; + short generation; /* increments with every reshape */ struct list_head handle_list; /* stripes needing handling */ struct list_head hold_list; /* preread ready stripes */ -- cgit v0.10.2 From 784052ecc6ade6b6acf4f67e4ada8e5f2e6df446 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:19:07 +1100 Subject: md/raid5: prepare for allowing reshape to change chunksize. Add "prev_chunk" to raid5_conf_t, similar to "previous_raid_disks", to remember what the chunk size was before the reshape that is currently underway. This seems like duplication with "chunk_size" and "new_chunk" in mddev_t, and to some extent it is, but there are differences. The values in mddev_t are always defined and often the same. The prev* values are only defined if a reshape is underway. 
Also (and more significantly) the raid5_conf_t values will be changed at the same time (inside an appropriate lock) that the reshape is started by setting reshape_position. In contrast, the new_chunk value is set when the sysfs file is written which could be well before the reshape starts. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 73cdf43..7638cc3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -299,7 +299,7 @@ static int grow_buffers(struct stripe_head *sh, int num) return 0; } -static void raid5_build_block(struct stripe_head *sh, int i); +static void raid5_build_block(struct stripe_head *sh, int i, int previous); static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, struct stripe_head *sh); @@ -337,7 +337,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) BUG(); } dev->flags = 0; - raid5_build_block(sh, i); + raid5_build_block(sh, i, previous); } insert_hash(conf, sh); } @@ -1212,9 +1212,9 @@ static void raid5_end_write_request(struct bio *bi, int error) } -static sector_t compute_blocknr(struct stripe_head *sh, int i); +static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); -static void raid5_build_block(struct stripe_head *sh, int i) +static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; @@ -1230,7 +1230,7 @@ static void raid5_build_block(struct stripe_head *sh, int i) dev->req.bi_private = sh; dev->flags = 0; - dev->sector = compute_blocknr(sh, i); + dev->sector = compute_blocknr(sh, i, previous); } static void error(mddev_t *mddev, mdk_rdev_t *rdev) @@ -1273,7 +1273,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, int pd_idx, qd_idx; int ddf_layout = 0; sector_t new_sector; - int sectors_per_chunk = conf->chunk_size >> 9; + int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) + : (conf->chunk_size >> 9); int raid_disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -1472,13 +1473,14 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, } -static sector_t compute_blocknr(struct stripe_head *sh, int i) +static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) { raid5_conf_t *conf = sh->raid_conf; int raid_disks = sh->disks; int data_disks = raid_disks - conf->max_degraded; sector_t new_sector = sh->sector, check; - int sectors_per_chunk = conf->chunk_size >> 9; + int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) + : (conf->chunk_size >> 9); sector_t stripe; int chunk_offset; int chunk_number, dummy1, dd_idx = i; @@ -1579,8 +1581,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; check = raid5_compute_sector(conf, r_sector, - (raid_disks != conf->raid_disks), - &dummy1, &sh2); + previous, &dummy1, &sh2); if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx || sh2.qd_idx != sh->qd_idx) { printk(KERN_ERR "compute_blocknr: map not correct\n"); @@ -1992,7 +1993,9 @@ static int page_is_zero(struct page *p) static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, struct stripe_head *sh) { - int sectors_per_chunk = conf->chunk_size >> 9; + int sectors_per_chunk = + previous ? (conf->prev_chunk >> 9) + : (conf->chunk_size >> 9); int dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; @@ -2662,7 +2665,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, int dd_idx, j; struct stripe_head *sh2; - sector_t bn = compute_blocknr(sh, i); + sector_t bn = compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL); sh2 = get_active_stripe(conf, s, 0, 1); @@ -3318,6 +3321,8 @@ static int raid5_mergeable_bvec(struct request_queue *q, if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ + if (mddev->new_chunk < mddev->chunk_size) + chunk_sectors = mddev->new_chunk >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; if (max <= biovec->bv_len && bio_sectors == 0) @@ -3333,6 +3338,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) unsigned int chunk_sectors = mddev->chunk_size >> 9; unsigned int bio_sectors = bio->bi_size >> 9; + if (mddev->new_chunk < mddev->chunk_size) + chunk_sectors = mddev->new_chunk >> 9; return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); } @@ -3788,7 +3795,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped BUG_ON(conf->reshape_progress == 0); stripe_addr = writepos; BUG_ON((mddev->dev_sectors & - ~((sector_t)mddev->chunk_size / 512 - 1)) + ~((sector_t)conf->chunk_size / 512 - 1)) - (conf->chunk_size / 512) - stripe_addr != sector_nr); } else { @@ -3811,7 +3818,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped if (conf->level == 6 && j == sh->qd_idx) continue; - s = compute_blocknr(sh, j); + s = compute_blocknr(sh, j, 0); if (s < raid5_size(mddev, 0, 0)) { skipped = 1; continue; @@ -4217,6 +4224,7 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) } sectors &= ~((sector_t)mddev->chunk_size/512 - 1); + sectors &= ~((sector_t)mddev->new_chunk/512 - 1); return sectors * (raid_disks - conf->max_degraded); } @@ -4322,6 
+4330,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->algorithm = mddev->new_layout; conf->max_nr_stripes = NR_STRIPES; conf->reshape_progress = mddev->reshape_position; + if (conf->reshape_progress != MaxSector) + conf->prev_chunk = mddev->chunk_size; memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; @@ -4385,7 +4395,7 @@ static int run(mddev_t *mddev) * geometry. */ here_new = mddev->reshape_position; - if (sector_div(here_new, (mddev->chunk_size>>9)* + if (sector_div(here_new, (mddev->new_chunk>>9)* (mddev->raid_disks - max_degraded))) { printk(KERN_ERR "raid5: reshape_position not " "on a stripe boundary\n"); @@ -4789,7 +4799,8 @@ static int raid5_check_reshape(mddev_t *mddev) if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", - (mddev->chunk_size / STRIPE_SIZE)*4); + (max(mddev->chunk_size, mddev->new_chunk) + / STRIPE_SIZE)*4); return -ENOSPC; } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index a081fb4..b9c9328 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -350,6 +350,7 @@ struct raid5_private_data { */ sector_t reshape_safe; int previous_raid_disks; + int prev_chunk; short generation; /* increments with every reshape */ struct list_head handle_list; /* stripes needing handling */ -- cgit v0.10.2 From e183eaedd53807e33f02ee80573e2833890e1f21 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:20:22 +1100 Subject: md/raid5: prepare for allowing reshape to change layout Add prev_algo to raid5_conf_t along the same lines as prev_chunk and previous_raid_disks. 
Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7638cc3..80ec9a6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1273,6 +1273,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, int pd_idx, qd_idx; int ddf_layout = 0; sector_t new_sector; + int algorithm = previous ? conf->prev_algo + : conf->algorithm; int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) : (conf->chunk_size >> 9); int raid_disks = previous ? conf->previous_raid_disks @@ -1307,7 +1309,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, pd_idx = data_disks; break; case 5: - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: pd_idx = data_disks - stripe % raid_disks; if (*dd_idx >= pd_idx) @@ -1335,13 +1337,13 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", - conf->algorithm); + algorithm); BUG(); } break; case 6: - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: pd_idx = raid_disks - 1 - (stripe % raid_disks); qd_idx = pd_idx + 1; @@ -1454,7 +1456,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, default: printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); + algorithm); BUG(); } break; @@ -1481,6 +1483,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) sector_t new_sector = sh->sector, check; int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) : (conf->chunk_size >> 9); + int algorithm = previous ? 
conf->prev_algo + : conf->algorithm; sector_t stripe; int chunk_offset; int chunk_number, dummy1, dd_idx = i; @@ -1497,7 +1501,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) switch(conf->level) { case 4: break; case 5: - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: if (i > sh->pd_idx) @@ -1516,14 +1520,14 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", - conf->algorithm); + algorithm); BUG(); } break; case 6: if (i == sh->qd_idx) return 0; /* It is the Q disk */ - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: case ALGORITHM_ROTATING_ZERO_RESTART: @@ -1571,7 +1575,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) break; default: printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); + algorithm); BUG(); } break; @@ -4330,8 +4334,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->algorithm = mddev->new_layout; conf->max_nr_stripes = NR_STRIPES; conf->reshape_progress = mddev->reshape_position; - if (conf->reshape_progress != MaxSector) + if (conf->reshape_progress != MaxSector) { conf->prev_chunk = mddev->chunk_size; + conf->prev_algo = mddev->layout; + } memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; @@ -4472,14 +4478,14 @@ static int run(mddev_t *mddev) if (mddev->degraded == 0) printk("raid5: raid level %d set %s active with %d out of %d" - " devices, algorithm %d\n", conf->level, mdname(mddev), - mddev->raid_disks-mddev->degraded, mddev->raid_disks, - conf->algorithm); + " devices, algorithm %d\n", conf->level, mdname(mddev), + mddev->raid_disks-mddev->degraded, mddev->raid_disks, + mddev->new_layout); else printk(KERN_ALERT "raid5: raid level %d set %s 
active with %d" " out of %d devices, algorithm %d\n", conf->level, mdname(mddev), mddev->raid_disks - mddev->degraded, - mddev->raid_disks, conf->algorithm); + mddev->raid_disks, mddev->new_layout); print_raid5_conf(conf); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index b9c9328..cdd0456 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -350,7 +350,7 @@ struct raid5_private_data { */ sector_t reshape_safe; int previous_raid_disks; - int prev_chunk; + int prev_chunk, prev_algo; short generation; /* increments with every reshape */ struct list_head handle_list; /* stripes needing handling */ -- cgit v0.10.2 From 7a6613810785872b7c028fba22fc0bae1c91733d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:21:40 +1100 Subject: md/raid5: reshape using largest of old and new chunk size This ensures that even when old and new stripes are overlapping, we will try to read all of the old before having to write any of the new. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 80ec9a6..f7fb2b8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3738,6 +3738,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped int dd_idx; sector_t writepos, safepos, gap; sector_t stripe_addr; + int reshape_sectors; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -3755,6 +3756,15 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped } } + /* We need to process a full chunk at a time. 
+ * If old and new chunk sizes differ, we need to process the + * largest of these + */ + if (mddev->new_chunk > mddev->chunk_size) + reshape_sectors = mddev->new_chunk / 512; + else + reshape_sectors = mddev->chunk_size / 512; + /* we update the metadata when there is more than 3Meg * in the block range (that is rather arbitrary, should * probably be time based) or when the data about to be @@ -3768,12 +3778,12 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->delta_disks < 0) { - writepos -= conf->chunk_size/512; - safepos += conf->chunk_size/512; + writepos -= reshape_sectors; + safepos += reshape_sectors; gap = conf->reshape_safe - conf->reshape_progress; } else { - writepos += conf->chunk_size/512; - safepos -= conf->chunk_size/512; + writepos += reshape_sectors; + safepos -= reshape_sectors; gap = conf->reshape_progress - conf->reshape_safe; } @@ -3799,14 +3809,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped BUG_ON(conf->reshape_progress == 0); stripe_addr = writepos; BUG_ON((mddev->dev_sectors & - ~((sector_t)conf->chunk_size / 512 - 1)) - - (conf->chunk_size / 512) - stripe_addr + ~((sector_t)reshape_sectors - 1)) + - reshape_sectors - stripe_addr != sector_nr); } else { - BUG_ON(writepos != sector_nr + conf->chunk_size / 512); + BUG_ON(writepos != sector_nr + reshape_sectors); stripe_addr = sector_nr; } - for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { + for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped = 0; sh = get_active_stripe(conf, stripe_addr+i, 0, 0); @@ -3839,9 +3849,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped } spin_lock_irq(&conf->device_lock); if (mddev->delta_disks < 0) - conf->reshape_progress -= i * new_data_disks; + conf->reshape_progress -= reshape_sectors * new_data_disks; else - conf->reshape_progress += i * 
new_data_disks; + conf->reshape_progress += reshape_sectors * new_data_disks; spin_unlock_irq(&conf->device_lock); /* Ok, those stripe are ready. We can start scheduling * reads on the source stripes. @@ -3867,7 +3877,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped /* If this takes us to the resync_max point where we have to pause, * then we need to write out the superblock. */ - sector_nr += conf->chunk_size>>9; + sector_nr += reshape_sectors; if (sector_nr >= mddev->resync_max) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, @@ -3883,7 +3893,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); } - return conf->chunk_size>>9; + return reshape_sectors; } /* FIXME go_faster isn't used */ -- cgit v0.10.2 From 88ce4930e2b80378d45506ce2c3bb5820e156e85 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:24:23 +1100 Subject: md/raid5: allow layout and chunksize to be changed on active array. If an array has 3 or more devices, we allow the chunksize or layout to be changed and when a reshape starts, we use these as the 'new' values. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f7fb2b8..4fdc6d0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4397,9 +4397,7 @@ static int run(mddev_t *mddev) int old_disks; int max_degraded = (mddev->level == 6 ? 
2 : 1); - if (mddev->new_level != mddev->level || - mddev->new_layout != mddev->layout || - mddev->new_chunk != mddev->chunk_size) { + if (mddev->new_level != mddev->level) { printk(KERN_ERR "raid5: %s: unsupported reshape " "required - aborting.\n", mdname(mddev)); @@ -4784,8 +4782,10 @@ static int raid5_check_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); - if (mddev->delta_disks == 0) - return 0; /* nothing to do */ + if (mddev->delta_disks == 0 && + mddev->new_layout == mddev->layout && + mddev->new_chunk == mddev->chunk_size) + return -EINVAL; /* nothing to do */ if (mddev->bitmap) /* Cannot grow a bitmap yet */ return -EBUSY; @@ -4860,6 +4860,10 @@ static int raid5_start_reshape(mddev_t *mddev) spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; conf->raid_disks += mddev->delta_disks; + conf->prev_chunk = conf->chunk_size; + conf->chunk_size = mddev->new_chunk; + conf->prev_algo = conf->algorithm; + conf->algorithm = mddev->new_layout; if (mddev->delta_disks < 0) conf->reshape_progress = raid5_size(mddev, 0, 0); else @@ -4952,6 +4956,7 @@ static void end_reshape(raid5_conf_t *conf) static void raid5_finish_reshape(mddev_t *mddev) { struct block_device *bdev; + raid5_conf_t *conf = mddev_to_conf(mddev); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -4970,7 +4975,6 @@ static void raid5_finish_reshape(mddev_t *mddev) } } else { int d; - raid5_conf_t *conf = mddev_to_conf(mddev); mddev->degraded = conf->raid_disks; for (d = 0; d < conf->raid_disks ; d++) if (conf->disks[d].rdev && @@ -4982,6 +4986,8 @@ static void raid5_finish_reshape(mddev_t *mddev) d++) raid5_remove_disk(mddev, d); } + mddev->layout = conf->algorithm; + mddev->chunk_size = conf->chunk_size; mddev->reshape_position = MaxSector; mddev->delta_disks = 0; } @@ -5080,11 +5086,10 @@ static void *raid5_takeover_raid6(mddev_t *mddev) static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) { - /* Currently the layout and chunk 
size can only be changed - * for a 2-drive raid array, as in that case no data shuffling - * is required. - * Later we might validate these and set new_* so a reshape - * can complete the change. + /* For a 2-drive array, the layout and chunk size can be changed + * immediately as not restriping is needed. + * For larger arrays we record the new value - after validation + * to be used by a reshape pass. */ raid5_conf_t *conf = mddev_to_conf(mddev); @@ -5103,19 +5108,49 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) /* They look valid */ - if (mddev->raid_disks != 2) - return -EINVAL; + if (mddev->raid_disks == 2) { - if (new_layout >= 0) { - conf->algorithm = new_layout; - mddev->layout = mddev->new_layout = new_layout; + if (new_layout >= 0) { + conf->algorithm = new_layout; + mddev->layout = mddev->new_layout = new_layout; + } + if (new_chunk > 0) { + conf->chunk_size = new_chunk; + mddev->chunk_size = mddev->new_chunk = new_chunk; + } + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + } else { + if (new_layout >= 0) + mddev->new_layout = new_layout; + if (new_chunk > 0) + mddev->new_chunk = new_chunk; } + return 0; +} + +static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) +{ + if (new_layout >= 0 && !algorithm_valid_raid6(new_layout)) + return -EINVAL; if (new_chunk > 0) { - conf->chunk_size = new_chunk; - mddev->chunk_size = mddev->new_chunk = new_chunk; + if (new_chunk & (new_chunk-1)) + /* not a power of 2 */ + return -EINVAL; + if (new_chunk < PAGE_SIZE) + return -EINVAL; + if (mddev->array_sectors & ((new_chunk>>9)-1)) + /* not factor of array size */ + return -EINVAL; } - set_bit(MD_CHANGE_DEVS, &mddev->flags); - md_wakeup_thread(mddev->thread); + + /* They look valid */ + + if (new_layout >= 0) + mddev->new_layout = new_layout; + if (new_chunk > 0) + mddev->new_chunk = new_chunk; + return 0; } @@ -5216,6 +5251,7 @@ static struct mdk_personality raid6_personality = #endif 
.quiesce = raid5_quiesce, .takeover = raid6_takeover, + .reconfig = raid6_reconfig, }; static struct mdk_personality raid5_personality = { -- cgit v0.10.2 From d1a7c50369835f9ecbd7752016cd9302ecfae678 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:24:32 +1100 Subject: md: don't display meaningless values in sysfs files resync_start and sync_speed When no resync is happening, both of these files currently have meaningless values (in slightly different ways). Change them to "none" in that case. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index c509313..2be574c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2888,6 +2888,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(mddev_t *mddev, char *page) { + if (mddev->recovery_cp == MaxSector) + return sprintf(page, "none\n"); return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); } @@ -3469,6 +3471,8 @@ static ssize_t sync_speed_show(mddev_t *mddev, char *page) { unsigned long resync, dt, db; + if (mddev->curr_resync == 0) + return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; if (!dt) dt++; -- cgit v0.10.2 From ab69ae12ceef7f23c578a3c230144e94a167a821 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:26:47 +1100 Subject: md/raid5: be more careful about write ordering when reshaping. When we are reshaping an array, it is very important that we read the data from a particular sector offset before writing new data at that offset. In most cases when growing or shrinking an array we read long before we even consider writing. But when restriping an array without changing its size, there is a small possibility that we might have some data available to write before the read has happened at the same location. This would require some stripes to be in cache already.
To guard against this small possibility, we check, before writing, that the 'old' stripe at the same location is not in the process of being read. And we ensure that we mark all 'source' stripes as such before allowing new 'destination' stripes to proceed. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4fdc6d0..062df846 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -395,7 +395,8 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, init_stripe(sh, sector, previous); } else { if (atomic_read(&sh->count)) { - BUG_ON(!list_empty(&sh->lru)); + BUG_ON(!list_empty(&sh->lru) + && !test_bit(STRIPE_EXPANDING, &sh->state)); } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); @@ -2944,6 +2945,23 @@ static bool handle_stripe5(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { + struct stripe_head *sh2 + = get_active_stripe(conf, sh->sector, 1, 1); + if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { + /* sh cannot be written until sh2 has been read. + * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh2->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh2); + goto unlock; + } + if (sh2) + release_stripe(sh2); + sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); for (i = conf->raid_disks; i--; ) { @@ -3172,6 +3190,23 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + struct stripe_head *sh2 + = get_active_stripe(conf, sh->sector, 1, 1); + if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { + /* sh cannot be written until sh2 has been read. 
+ * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh2->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh2); + goto unlock; + } + if (sh2) + release_stripe(sh2); + /* Need to write out all blocks after computing P&Q */ sh->disks = conf->raid_disks; stripe_set_idx(sh->sector, conf, 0, sh); @@ -3739,6 +3774,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped sector_t writepos, safepos, gap; sector_t stripe_addr; int reshape_sectors; + struct list_head stripes; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -3816,6 +3852,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped BUG_ON(writepos != sector_nr + reshape_sectors); stripe_addr = sector_nr; } + INIT_LIST_HEAD(&stripes); for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped = 0; @@ -3845,7 +3882,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped set_bit(STRIPE_EXPAND_READY, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } - release_stripe(sh); + list_add(&sh->lru, &stripes); } spin_lock_irq(&conf->device_lock); if (mddev->delta_disks < 0) @@ -3874,6 +3911,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped release_stripe(sh); first_sector += STRIPE_SECTORS; } + /* Now that the sources are clearly marked, we can release + * the destination stripes + */ + while (!list_empty(&stripes)) { + sh = list_entry(stripes.next, struct stripe_head, lru); + list_del_init(&sh->lru); + release_stripe(sh); + } /* If this takes us to the resync_max point where we have to pause, * then we need to write out the superblock. 
*/ -- cgit v0.10.2 From 2cffc4a01dd90a502324e3453d7b245d6d16e1c2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:27:05 +1100 Subject: md: remove CONFIG_MD_RAID_RESHAPE config option. This was only needed when the code was experimental. Most of it is well tested now, so the option is no longer useful. Signed-off-by: NeilBrown diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 449d0b9..36e0675 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -152,35 +152,6 @@ config MD_RAID456 If unsure, say Y. -config MD_RAID5_RESHAPE - bool "Support adding drives to a raid-5 array" - depends on MD_RAID456 - default y - ---help--- - A RAID-5 set can be expanded by adding extra drives. This - requires "restriping" the array which means (almost) every - block must be written to a different place. - - This option allows such restriping to be done while the array - is online. - - You will need mdadm version 2.4.1 or later to use this - feature safely. During the early stage of reshape there is - a critical section where live data is being over-written. A - crash during this time needs extra care for recovery. The - newer mdadm takes a copy of the data in the critical section - and will restore it, if necessary, after a crash. - - The mdadm usage is e.g. - mdadm --grow /dev/md1 --raid-disks=6 - to grow '/dev/md1' to having 6 disks. - - Note: The array can only be expanded, not contracted. - There should be enough spares already present to make the new - array workable. - - If unsure, say Y. - config MD_RAID6_PQ tristate diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 062df846..fb11c13 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -948,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num) return 0; } -#ifdef CONFIG_MD_RAID5_RESHAPE static int resize_stripes(raid5_conf_t *conf, int newsize) { /* Make all the stripes able to hold 'newsize' devices. 
@@ -1073,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) conf->pool_size = newsize; return err; } -#endif static int drop_one_stripe(raid5_conf_t *conf) { @@ -4822,7 +4820,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return 0; } -#ifdef CONFIG_MD_RAID5_RESHAPE static int raid5_check_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -4967,7 +4964,6 @@ static int raid5_start_reshape(mddev_t *mddev) md_new_event(mddev); return 0; } -#endif /* This is called from the reshape thread and should make any * changes needed in 'conf' @@ -5289,11 +5285,9 @@ static struct mdk_personality raid6_personality = .sync_request = sync_request, .resize = raid5_resize, .size = raid5_size, -#ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, -#endif .quiesce = raid5_quiesce, .takeover = raid6_takeover, .reconfig = raid6_reconfig, @@ -5314,11 +5308,9 @@ static struct mdk_personality raid5_personality = .sync_request = sync_request, .resize = raid5_resize, .size = raid5_size, -#ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, -#endif .quiesce = raid5_quiesce, .takeover = raid5_takeover, .reconfig = raid5_reconfig, @@ -5340,11 +5332,9 @@ static struct mdk_personality raid4_personality = .sync_request = sync_request, .resize = raid5_resize, .size = raid5_size, -#ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, -#endif .quiesce = raid5_quiesce, }; -- cgit v0.10.2 From b0f9ec047b79a92e8b8a9dfbf97537c8fbef234a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:27:18 +1100 Subject: md/raid5: minor code cleanups in make_request. ... 
and to be certain that make_request doesn't wait forever, add a 'wake_up' when ->reshape_progress has been set to MaxSector Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index fb11c13..bb4b12e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3637,10 +3637,9 @@ static int make_request(struct request_queue *q, struct bio * bi) retry: previous = 0; + disks = conf->raid_disks; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - if (likely(conf->reshape_progress == MaxSector)) - disks = conf->raid_disks; - else { + if (unlikely(conf->reshape_progress != MaxSector)) { /* spinlock is needed as reshape_progress may be * 64bit on a 32bit platform, and so it might be * possible to see a half-updated value @@ -3650,7 +3649,6 @@ static int make_request(struct request_queue *q, struct bio * bi) * to check again. */ spin_lock_irq(&conf->device_lock); - disks = conf->raid_disks; if (mddev->delta_disks < 0 ? logical_sector < conf->reshape_progress : logical_sector >= conf->reshape_progress) { @@ -3679,7 +3677,7 @@ static int make_request(struct request_queue *q, struct bio * bi) sh = get_active_stripe(conf, new_sector, previous, (bi->bi_rw&RWA_MASK)); if (sh) { - if (unlikely(conf->reshape_progress != MaxSector)) { + if (unlikely(previous)) { /* expansion might have moved on while waiting for a * stripe, so we must do the range check again. * Expansion could still move past after this @@ -3690,10 +3688,9 @@ static int make_request(struct request_queue *q, struct bio * bi) */ int must_retry = 0; spin_lock_irq(&conf->device_lock); - if ((mddev->delta_disks < 0 - ? logical_sector >= conf->reshape_progress - : logical_sector < conf->reshape_progress) - && previous) + if (mddev->delta_disks < 0 + ?
logical_sector >= conf->reshape_progress + : logical_sector < conf->reshape_progress) /* mismatch, need to try again */ must_retry = 1; spin_unlock_irq(&conf->device_lock); @@ -4977,6 +4974,7 @@ static void end_reshape(raid5_conf_t *conf) conf->previous_raid_disks = conf->raid_disks; conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); /* read-ahead size must cover two whole stripes, which is * 2 * (datadisks) * chunksize where 'n' is the number of raid devices -- cgit v0.10.2 From c8f517c444e4f9f55b5b5ca202b8404691a35805 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:28:40 +1100 Subject: md/raid5 revise rules for when to update metadata during reshape We currently update the metadata: 1/ every 3Megabytes 2/ When the place we will write new-layout data to is recorded in the metadata as still containing old-layout data. Rule one exists to avoid having to re-do too much reshaping in the face of a crash/restart. So it should really be time based rather than size based. So change it to "every 10 seconds". Rule two turns out to be too harsh when restriping an array 'in-place', as in that case the metadata must be updated for every stripe. For the in-place update, it can only possibly be safe from a crash if some user-space program takes a backup of every e.g. few hundred stripes before allowing them to be reshaped. In that case, the constant metadata update is pointless. So only update the metadata if the new metadata will report that the end of the 'old-layout' data is beyond where we are currently writing 'new-layout' data.
Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bb4b12e..3bbc6d6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3766,7 +3766,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped int new_data_disks = conf->raid_disks - conf->max_degraded; int i; int dd_idx; - sector_t writepos, safepos, gap; + sector_t writepos, readpos, safepos; sector_t stripe_addr; int reshape_sectors; struct list_head stripes; @@ -3806,26 +3806,46 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped */ writepos = conf->reshape_progress; sector_div(writepos, new_data_disks); + readpos = conf->reshape_progress; + sector_div(readpos, data_disks); safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->delta_disks < 0) { writepos -= reshape_sectors; + readpos += reshape_sectors; safepos += reshape_sectors; - gap = conf->reshape_safe - conf->reshape_progress; } else { writepos += reshape_sectors; + readpos -= reshape_sectors; safepos -= reshape_sectors; - gap = conf->reshape_progress - conf->reshape_safe; } + /* 'writepos' is the most advanced device address we might write. + * 'readpos' is the least advanced device address we might read. + * 'safepos' is the least address recorded in the metadata as having + * been reshaped. + * If 'readpos' is behind 'writepos', then there is no way that we can + * ensure safety in the face of a crash - that must be done by userspace + * making a backup of the data. So in that case there is no particular + * rush to update metadata. + * Otherwise if 'safepos' is behind 'writepos', then we really need to + * update the metadata to advance 'safepos' to match 'readpos' so that + * we can be safe in the event of a crash. + * So we insist on updating metadata if safepos is behind writepos and + * readpos is beyond writepos. + * In any case, update the metadata every 10 seconds. 
+ * Maybe that number should be configurable, but I'm not sure it is + * worth it.... maybe it could be a multiple of safemode_delay??? + */ if ((mddev->delta_disks < 0 - ? writepos < safepos - : writepos > safepos) || - gap > (new_data_disks)*3000*2 /*3Meg*/) { + ? (safepos > writepos && readpos < writepos) + : (safepos < writepos && readpos > writepos)) || + time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); mddev->reshape_position = conf->reshape_progress; + conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || @@ -3923,6 +3943,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); mddev->reshape_position = conf->reshape_progress; + conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, @@ -4957,6 +4978,7 @@ static int raid5_start_reshape(mddev_t *mddev) spin_unlock_irq(&conf->device_lock); return -EAGAIN; } + conf->reshape_checkpoint = jiffies; md_wakeup_thread(mddev->sync_thread); md_new_event(mddev); return 0; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index cdd0456..52ba999 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -352,6 +352,8 @@ struct raid5_private_data { int previous_raid_disks; int prev_chunk, prev_algo; short generation; /* increments with every reshape */ + unsigned long reshape_checkpoint; /* Time we last updated + * metadata */ struct list_head handle_list; /* stripes needing handling */ struct list_head hold_list; /* preread ready stripes */ -- cgit v0.10.2