From 101a642781cf6b37848c5468d6c1a20a76c35ea9 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 3 Apr 2014 17:53:31 +0300 Subject: ore: (trivial) reformat some code rearrange some source lines. Nothing changed. Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index dae8846..92157b6 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -675,8 +675,8 @@ static int _prepare_for_striping(struct ore_io_state *ios) si->cur_pg = si->unit_off / PAGE_SIZE; while (length) { - unsigned comp = dev - first_dev; - struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; + struct ore_per_dev_state *per_dev = + &ios->per_dev[dev - first_dev]; unsigned cur_len, page_off = 0; if (!per_dev->length) { @@ -708,11 +708,9 @@ static int _prepare_for_striping(struct ore_io_state *ios) if (unlikely(ret)) goto out; - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - length -= cur_len; + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; si->cur_comp = (si->cur_comp + 1) % group_width; if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { if (!length && ios->sp2d) { @@ -721,11 +719,6 @@ static int _prepare_for_striping(struct ore_io_state *ios) */ dev = si->par_dev; } - if (ios->sp2d) - /* In writes cur_len just means if it's the - * last one. See _ore_add_parity_unit. - */ - cur_len = length; per_dev = &ios->per_dev[dev - first_dev]; if (!per_dev->length) { /* Only/always the parity unit of the first @@ -736,7 +729,11 @@ static int _prepare_for_striping(struct ore_io_state *ios) per_dev->offset = si->obj_offset - si->unit_off; } - ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + ret = _ore_add_parity_unit(ios, si, per_dev, + ios->sp2d ? length : cur_len); if (unlikely(ret)) goto out; diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 4e2c032..af417d3 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -226,9 +226,7 @@ static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) init_async_submit(&_1ps->submit, ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, - NULL, - NULL, NULL, - (addr_conv_t *)_1ps->scribble); + NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); /* TODO: raid6 */ _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, -- cgit v0.10.2 From 455682ce547817d75e38028283dc8db00754005d Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 9 Apr 2014 23:14:38 +0300 Subject: ore: Remove redundant dev_order(), more cleanups Two cleanups: * si->cur_comp, si->cur_pg where always calculated after the call to ore_calc_stripe_info() with the help of _dev_order(...). But these are already calculated by ore_calc_stripe_info() and can be just set there. (This is left over from the time that si->cur_comp, si->cur_pg were only used by raid code, but now the main loop manages them anyway even though they are ultimately not used in none raid code) * si->cur_comp - For the very last stripe case, was set inside _ore_add_parity_unit(). This is not clear and will be wrong for coming raid6 so move this to only caller. Now si->cur_comp is only manipulated within _prepare_for_striping(), always next to the manipulation of cur_dev. Which is much easier to understand and follow. Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 92157b6..0e2a835 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -545,17 +545,19 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, /* "H - (N * U)" is just "H % U" so it's bound to u32 */ u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; + u32 first_dev = C - C % group_width; div_u64_rem(file_offset, stripe_unit, &si->unit_off); si->obj_offset = si->unit_off + (N * stripe_unit) + (M * group_depth * stripe_unit); + si->cur_comp = C - first_dev; + si->cur_pg = si->unit_off / PAGE_SIZE; if (parity) { u32 LCMdP = lcm(group_width, parity) / parity; /* R = N % LCMdP; */ u32 RxP = (N % LCMdP) * parity; - u32 first_dev = C - C % group_width; si->par_dev = (group_width + group_width - parity - RxP) % group_width + first_dev; @@ -670,9 +672,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) BUG_ON(length > si->length); - dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); - si->cur_comp = dev_order; - si->cur_pg = si->unit_off / PAGE_SIZE; + dev_order = si->cur_comp; while (length) { struct ore_per_dev_state *per_dev = @@ -718,6 +718,8 @@ static int _prepare_for_striping(struct ore_io_state *ios) * stripe. then operate on parity dev. */ dev = si->par_dev; + /* If last stripe operate on parity comp */ + si->cur_comp = group_width - ios->layout->parity; } per_dev = &ios->per_dev[dev - first_dev]; if (!per_dev->length) { diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index af417d3..d58a952 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -402,9 +402,8 @@ static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) ore_calc_stripe_info(ios->layout, *offset, 0, &si); - p = si.unit_off / PAGE_SIZE; - c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, - ios->layout->mirrors_p1, si.par_dev, si.dev); + p = si.cur_pg; + c = si.cur_comp; page = ios->sp2d->_1p_stripes[p].pages[c]; pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); @@ -532,9 +531,8 @@ static int _read_4_write_last_stripe(struct ore_io_state *ios) goto read_it; ore_calc_stripe_info(ios->layout, offset, 0, &read_si); - p = read_si.unit_off / PAGE_SIZE; - c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, - ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); + p = read_si.cur_pg; + c = read_si.cur_comp; if (min_p == sp2d->pages_in_unit) { /* Didn't do it yet */ @@ -638,9 +636,6 @@ int _ore_add_parity_unit(struct ore_io_state *ios, si->cur_pg = _sp2d_min_pg(sp2d); num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; - if (!cur_len) /* If last stripe operate on parity comp */ - si->cur_comp = sp2d->data_devs; - if (!per_dev->length) { per_dev->offset += si->cur_pg * PAGE_SIZE; /* If first stripe, Read in all read4write pages diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index 2ffd2c3..d365bda 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -31,24 +31,6 @@ #define ORE_DBGMSG2(M...) do {} while (0) /* #define ORE_DBGMSG2 ORE_DBGMSG */ -/* Calculate the component order in a stripe. eg the logical data unit - * address within the stripe of @dev given the @par_dev of this stripe. - */ -static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, - unsigned par_dev, unsigned dev) -{ - unsigned first_dev = dev - dev % devs_in_group; - - dev -= first_dev; - par_dev -= first_dev; - - if (devs_in_group == par_dev) /* The raid 0 case */ - return dev / mirrors_p1; - /* raid4/5/6 case */ - return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / - mirrors_p1; -} - /* ios_raid.c stuff needed by ios.c */ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); void _ore_free_raid_stuff(struct ore_io_state *ios); -- cgit v0.10.2 From ce5d36aac26cc395fe3bc45525cdbad3644f01e5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 22 May 2014 14:48:15 +0300 Subject: ore: Support for raid 6 This simple patch adds support for raid6 to the ORE. Most operations and calculations where already for the general case. Only things left: * call async_gen_syndrome() in the case of raid6 (NOTE that the raid6 math is the one supported by the Linux Kernel see: crypto/async_tx/async_pq.c) * call _ore_add_parity_unit() twice with only last call generating the redundancy pages. * Fix couple BUGS in old code a. In reads when parity==2 it can happen that per_dev->length=0 but per_dev->offset was set and adjusted by _ore_add_sg_seg(). Don't let it be overwritten. b. The all 'cur_comp > starting_dev' thing to determine if: "per_dev->offset is in the current stripe number or the next one." Was a complete raid5/4 accident. When parity==2 this is not at all true usually. All we need to do is increment si->ob_offset once we pass by the first parity device. (This also greatly simplifies the code, amen) c. Calculation of si->dev rotation can overflow when parity==2. * Then last enable raid6 in ore_verify_layout() I want to deeply thank Daniel Gryniewicz who found first all the bugs in the old raid code, and inspired these patches: Inspired-by Daniel Gryniewicz Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore index 1ca7fb7..2daf232 100644 --- a/fs/exofs/Kconfig.ore +++ b/fs/exofs/Kconfig.ore @@ -9,4 +9,6 @@ config ORE tristate depends on EXOFS_FS || PNFS_OBJLAYOUT select ASYNC_XOR + select RAID6_PQ + select ASYNC_PQ default SCSI_OSD_ULD diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 0e2a835..cfc0205 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->parity = 1; break; case PNFS_OSD_RAID_PQ: + layout->parity = 2; + break; case PNFS_OSD_RAID_4: default: - ORE_ERR("Only RAID_0/5 for now\n"); + ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n", + layout->raid_algorithm); return -EINVAL; } if (0 != (layout->stripe_unit & ~PAGE_MASK)) { @@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length /= stripe_length; layout->max_io_length *= stripe_length; } + ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length); + return 0; } EXPORT_SYMBOL(ore_verify_layout); @@ -561,7 +566,8 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, si->par_dev = (group_width + group_width - parity - RxP) % group_width + first_dev; - si->dev = (group_width + C - RxP) % group_width + first_dev; + si->dev = (group_width + group_width + C - RxP) % + group_width + first_dev; si->bytes_in_stripe = U; si->first_stripe_start = M * S + G * T + N * U; } else { @@ -651,6 +657,43 @@ out: /* we fail the complete unit on an error eg don't advance return ret; } +static int _add_parity_units(struct ore_io_state *ios, + struct ore_striping_info *si, + unsigned dev, unsigned first_dev, + unsigned mirrors_p1, unsigned devs_in_group, + unsigned cur_len) +{ + unsigned do_parity; + int ret = 0; + + for (do_parity = ios->layout->parity; do_parity; --do_parity) { + struct ore_per_dev_state *per_dev; + + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length && !per_dev->offset) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } + + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len, + do_parity == 1); + if (unlikely(ret)) + break; + + if (do_parity != 1) { + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; + si->cur_comp = (si->cur_comp + 1) % + ios->layout->group_width; + } + } + + return ret; +} + static int _prepare_for_striping(struct ore_io_state *ios) { struct ore_striping_info *si = &ios->si; @@ -660,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios) unsigned devs_in_group = group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); - unsigned dev_order; unsigned cur_pg = ios->pages_consumed; u64 length = ios->length; int ret = 0; @@ -672,14 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios) BUG_ON(length > si->length); - dev_order = si->cur_comp; - while (length) { struct ore_per_dev_state *per_dev = &ios->per_dev[dev - first_dev]; unsigned cur_len, page_off = 0; - if (!per_dev->length) { + if (!per_dev->length && !per_dev->offset) { + /* First time initialize the per_dev info. */ per_dev->dev = dev; if (dev == si->dev) { WARN_ON(dev == si->par_dev); @@ -688,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && (page_off != ios->pgbase)); } else { - if (si->cur_comp > dev_order) - per_dev->offset = - si->obj_offset - si->unit_off; - else /* si->cur_comp < dev_order */ - per_dev->offset = - si->obj_offset + stripe_unit - - si->unit_off; + per_dev->offset = si->obj_offset - si->unit_off; cur_len = stripe_unit; } } else { @@ -721,20 +756,12 @@ static int _prepare_for_striping(struct ore_io_state *ios) /* If last stripe operate on parity comp */ si->cur_comp = group_width - ios->layout->parity; } - per_dev = &ios->per_dev[dev - first_dev]; - if (!per_dev->length) { - /* Only/always the parity unit of the first - * stripe will be empty. So this is a chance to - * initialize the per_dev info. - */ - per_dev->dev = dev; - per_dev->offset = si->obj_offset - si->unit_off; - } /* In writes cur_len just means if it's the * last one. See _ore_add_parity_unit. */ - ret = _ore_add_parity_unit(ios, si, per_dev, + ret = _add_parity_units(ios, si, dev, first_dev, + mirrors_p1, devs_in_group, ios->sp2d ? length : cur_len); if (unlikely(ret)) goto out; @@ -746,6 +773,8 @@ static int _prepare_for_striping(struct ore_io_state *ios) /* Next stripe, start fresh */ si->cur_comp = 0; si->cur_pg = 0; + si->obj_offset += cur_len; + si->unit_off = 0; } } out: diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index d58a952..7f20f25 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -218,20 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) { unsigned p; + unsigned tx_flags = ASYNC_TX_ACK; + + if (sp2d->parity == 1) + tx_flags |= ASYNC_TX_XOR_ZERO_DST; + for (p = 0; p < sp2d->pages_in_unit; p++) { struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; if (!_1ps->write_count) continue; - init_async_submit(&_1ps->submit, - ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, + init_async_submit(&_1ps->submit, tx_flags, NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); - /* TODO: raid6 */ - _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, - 0, sp2d->data_devs, PAGE_SIZE, - &_1ps->submit); + if (sp2d->parity == 1) + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], + _1ps->pages, 0, sp2d->data_devs, + PAGE_SIZE, &_1ps->submit); + else /* parity == 2 */ + _1ps->tx = async_gen_syndrome(_1ps->pages, 0, + sp2d->data_devs + sp2d->parity, + PAGE_SIZE, &_1ps->submit); } for (p = 0; p < sp2d->pages_in_unit; p++) { @@ -616,7 +624,7 @@ static int _read_4_write_execute(struct ore_io_state *ios) int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, struct ore_per_dev_state *per_dev, - unsigned cur_len) + unsigned cur_len, bool do_xor) { if (ios->reading) { if (per_dev->cur_sg >= ios->sgs_per_dev) { @@ -641,9 +649,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios, /* If first stripe, Read in all read4write pages * (if needed) before we calculate the first parity. */ - _read_4_write_first_stripe(ios); + if (do_xor) + _read_4_write_first_stripe(ios); } - if (!cur_len) /* If last stripe r4w pages of last stripe */ + if (!cur_len && do_xor) + /* If last stripe r4w pages of last stripe */ _read_4_write_last_stripe(ios); _read_4_write_execute(ios); @@ -655,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios, ++(ios->cur_par_page); } - BUG_ON(si->cur_comp != sp2d->data_devs); + BUG_ON(si->cur_comp < sp2d->data_devs); BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, @@ -663,9 +673,10 @@ int _ore_add_parity_unit(struct ore_io_state *ios, if (unlikely(ret)) return ret; - /* TODO: raid6 if (last_parity_dev) */ - _gen_xor_unit(sp2d); - _sp2d_reset(sp2d, ios->r4w, ios->private); + if (do_xor) { + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); + } } return 0; } diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index d365bda..cf6375d 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -38,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios); void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, bool not_last); int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, - struct ore_per_dev_state *per_dev, unsigned cur_len); + struct ore_per_dev_state *per_dev, unsigned cur_len, + bool do_xor); void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, struct ore_striping_info *si, struct page *page); static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, -- cgit v0.10.2