From a8f8c45082cb78489c5e6050c372ddac73ae439e Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 15 Aug 2011 11:52:36 -0700 Subject: osd: Kconfig remove wrong FIXME The OSD protocol calls for all kinds of security levels that use CRYPTO_HMAC and SHA1, but the current code only supports NO_SEC, which does not use any of these. Remove a wrong FIXME that calls for them. Thanks to Maxin for reporting this. Reported-by: "Maxin B. John" Signed-off-by: Boaz Harrosh diff --git a/drivers/scsi/osd/Kconfig b/drivers/scsi/osd/Kconfig index 861b5ce..a070351 100644 --- a/drivers/scsi/osd/Kconfig +++ b/drivers/scsi/osd/Kconfig @@ -11,10 +11,6 @@ # it under the terms of the GNU General Public version 2 License as # published by the Free Software Foundation # -# FIXME: SCSI_OSD_INITIATOR should select CONFIG (HMAC) SHA1 somehow. -# How is it done properly? -# - config SCSI_OSD_INITIATOR tristate "OSD-Initiator library" depends on SCSI -- cgit v0.10.2 From 1958c7c284298f0d17a1ab3820c77c40ea3d3711 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 23 Sep 2011 13:42:43 -0700 Subject: exofs/ore.c: local functions should be static This quiets the sparse noise: warning: symbol '_calc_trunk_info' was not declared. Should it be static? Signed-off-by: H Hartley Sweeten Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 25305af..6114fdf 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -753,8 +753,8 @@ struct _trunc_info { unsigned max_devs; }; -void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, - struct _trunc_info *ti) +static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, + struct _trunc_info *ti) { unsigned stripe_unit = layout->stripe_unit; -- cgit v0.10.2 From de74b05ace743b4a7aefad9e9b33ff899979b34a Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 23 Sep 2011 13:46:51 -0700 Subject: exofs/super.c: local functions should be static This quiets the following sparse noise: warning: symbol 'exofs_sync_fs' was not declared. Should it be static? warning: symbol 'exofs_free_sbi' was not declared. Should it be static? warning: symbol 'exofs_get_parent' was not declared. Should it be static?
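For illustration, a minimal example of the class of warning these two patches silence (hypothetical helper names, not from the patches):

	/* sparse flags any definition with external linkage that has no
	 * declaration in an included header:
	 *     warning: symbol 'helper' was not declared. Should it be static?
	 * Internal linkage silences the warning and lets the compiler
	 * discard the needlessly exported symbol.
	 */
	int helper(void) { return 0; }		/* external linkage: sparse warns */
	static int local_helper(void) { return 0; }	/* file-local: quiet */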
Signed-off-by: H Hartley Sweeten Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 2748940..afe79ee 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -355,7 +355,7 @@ static const struct export_operations exofs_export_ops; /* * Write the superblock to the OSD */ -int exofs_sync_fs(struct super_block *sb, int wait) +static int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; @@ -429,7 +429,7 @@ static void _exofs_print_device(const char *msg, const char *dev_path, msg, dev_path ?: "", odi->osdname, _LLU(pid)); } -void exofs_free_sbi(struct exofs_sb_info *sbi) +static void exofs_free_sbi(struct exofs_sb_info *sbi) { while (sbi->comps.numdevs) { int i = --sbi->comps.numdevs; @@ -981,7 +981,7 @@ static const struct super_operations exofs_sops = { * EXPORT OPERATIONS *****************************************************************************/ -struct dentry *exofs_get_parent(struct dentry *child) +static struct dentry *exofs_get_parent(struct dentry *child) { unsigned long ino = exofs_parent_ino(child); -- cgit v0.10.2 From 5bf696dad4beecb6174e701c97e1f2574e6a2c96 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 11:39:59 +0300 Subject: exofs: Rename struct ore_components comps => oc ore_components already has a comps member, so this leads to things like comps->comps, which is annoying. The name oc was already used in new code, so rename all old usage of ore_components comps => ore_components oc. Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index f4e442e..c09d5a7 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -71,7 +71,7 @@ struct exofs_sb_info { */ struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ - struct ore_components comps; /* comps for the partition */ + struct ore_components oc; /* comps for the partition */ struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; @@ -86,7 +86,7 @@ struct exofs_i_info { uint32_t i_dir_start_lookup; /* which page to start lookup */ uint64_t i_commit_size; /* the object's written length */ struct ore_comp one_comp; /* same component for all devices */ - struct ore_components comps; /* inode view of the device table */ + struct ore_components oc; /* inode view of the device table */ }; static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) @@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations; * bigger and that the device table repeats twice.
* See: exofs_read_lookup_dev_table() */ -static inline void exofs_init_comps(struct ore_components *comps, +static inline void exofs_init_comps(struct ore_components *oc, struct ore_comp *one_comp, struct exofs_sb_info *sbi, osd_id oid) { @@ -217,13 +217,13 @@ static inline void exofs_init_comps(struct ore_components *comps, one_comp->obj.id = oid; exofs_make_credential(one_comp->cred, &one_comp->obj); - comps->numdevs = sbi->comps.numdevs; - comps->single_comp = EC_SINGLE_COMP; - comps->comps = one_comp; + oc->numdevs = sbi->oc.numdevs; + oc->single_comp = EC_SINGLE_COMP; + oc->comps = one_comp; /* Round robin device view of the table */ - first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs; - comps->ods = sbi->comps.ods + first_dev; + first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; + oc->ods = sbi->oc.ods + first_dev; } #endif diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f39a38f..61b2f7e 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -270,7 +270,7 @@ static int read_exec(struct page_collect *pcol) return 0; if (!pcol->ios) { - int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true, + int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); @@ -516,7 +516,7 @@ static int write_exec(struct page_collect *pcol) return 0; BUG_ON(pcol->ios); - ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false, + ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); @@ -860,7 +860,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize); + ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); if (likely(!ret)) truncate_setsize(inode, newsize); @@ -927,14 +927,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, struct exofs_on_disk_inode_layout *layout; int ret; - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } - attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); - attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); @@ -1018,7 +1018,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) return inode; oi = exofs_i(inode); __oi_init(oi); - exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, exofs_oi_objno(oi)); /* read the inode from the osd */ @@ -1172,13 +1172,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) spin_unlock(&sbi->s_next_gen_lock); insert_inode_hash(inode); - exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, exofs_oi_objno(oi)); exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ mark_inode_dirty(inode); - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); return ERR_PTR(ret); @@ -1267,7 +1267,7 @@ static int exofs_update_inode(struct inode *inode, int 
do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); goto free_args; @@ -1350,7 +1350,7 @@ void exofs_evict_inode(struct inode *inode) /* ignore the error, attempt a remove anyway */ /* Now Remove the OSD objects */ - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); return; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 6114fdf..870f85a 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -49,20 +49,20 @@ MODULE_LICENSE("GPL"); static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) { - return ios->comps->comps[index & ios->comps->single_comp].cred; + return ios->oc->comps[index & ios->oc->single_comp].cred; } static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) { - return &ios->comps->comps[index & ios->comps->single_comp].obj; + return &ios->oc->comps[index & ios->oc->single_comp].obj; } static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { - return ios->comps->ods[index]; + return ios->oc->ods[index]; } -int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, bool is_reading, u64 offset, u64 length, struct ore_io_state **pios) { @@ -71,16 +71,16 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, /*TODO: Maybe use kmem_cach per sbi of size * exofs_io_state_size(layout->s_numdevs) */ - ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL); + ios = kzalloc(ore_io_state_size(oc->numdevs), GFP_KERNEL); if (unlikely(!ios)) { ORE_DBGMSG("Failed kzalloc bytes=%d\n", - ore_io_state_size(comps->numdevs)); + ore_io_state_size(oc->numdevs)); *pios = NULL; return -ENOMEM; } ios->layout = layout; - ios->comps = comps; + ios->oc = oc; ios->offset = offset; ios->length = length; ios->reading = is_reading; @@ -90,10 +90,10 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, } EXPORT_SYMBOL(ore_get_rw_state); -int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, +int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, struct ore_io_state **ios) { - return ore_get_rw_state(layout, comps, true, 0, 0, ios); + return ore_get_rw_state(layout, oc, true, 0, 0, ios); } EXPORT_SYMBOL(ore_get_io_state); @@ -476,7 +476,7 @@ int ore_create(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->comps->numdevs; i++) { + for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); @@ -501,7 +501,7 @@ int ore_remove(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->comps->numdevs; i++) { + for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); @@ -768,7 +768,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, ti->max_devs = layout->group_width * layout->group_count; } -int ore_truncate(struct ore_layout *layout, struct ore_components *comps, +int ore_truncate(struct ore_layout *layout, struct ore_components *oc, u64 size) { struct ore_io_state *ios; @@ -779,7 +779,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, struct _trunc_info 
ti; int i, ret; - ret = ore_get_io_state(layout, comps, &ios); + ret = ore_get_io_state(layout, oc, &ios); if (unlikely(ret)) return ret; @@ -792,7 +792,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, goto out; } - ios->numdevs = ios->comps->numdevs; + ios->numdevs = ios->oc->numdevs; for (i = 0; i < ti.max_devs; ++i) { struct exofs_trunc_attr *size_attr = &size_attrs[i]; @@ -815,7 +815,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, size_attr->attr.val_ptr = &size_attr->newsize; ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", - _LLU(comps->comps->obj.id), _LLU(obj_size), i); + _LLU(oc->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); if (unlikely(ret)) diff --git a/fs/exofs/super.c b/fs/exofs/super.c index afe79ee..babb195 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -266,7 +266,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) struct ore_io_state *ios; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; @@ -321,7 +321,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) struct ore_io_state *ios; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; @@ -360,7 +360,7 @@ static int exofs_sync_fs(struct super_block *sb, int wait) struct exofs_sb_info *sbi; struct exofs_fscb *fscb; struct ore_comp one_comp; - struct ore_components comps; + struct ore_components oc; struct ore_io_state *ios; int ret = -ENOMEM; @@ -378,9 +378,9 @@ static int exofs_sync_fs(struct super_block *sb, int wait) * the writeable info is set in exofs_sbi_write_stats() above. 
*/ - exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); + exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); - ret = ore_get_io_state(&sbi->layout, &comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oc, &ios); if (unlikely(ret)) goto out; @@ -431,17 +431,17 @@ static void _exofs_print_device(const char *msg, const char *dev_path, static void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->comps.numdevs) { - int i = --sbi->comps.numdevs; - struct osd_dev *od = sbi->comps.ods[i]; + while (sbi->oc.numdevs) { + int i = --sbi->oc.numdevs; + struct osd_dev *od = sbi->oc.ods[i]; if (od) { - sbi->comps.ods[i] = NULL; + sbi->oc.ods[i] = NULL; osduld_put_device(od); } } - if (sbi->comps.ods != sbi->_min_one_dev) - kfree(sbi->comps.ods); + if (sbi->oc.ods != sbi->_min_one_dev) + kfree(sbi->oc.ods); kfree(sbi); } @@ -468,7 +468,7 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], + _exofs_print_device("Unmounting", NULL, sbi->oc.ods[0], sbi->one_comp.obj.partition); bdi_destroy(&sbi->bdi); @@ -623,7 +623,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, return -ENOMEM; } - sbi->comps.numdevs = 0; + sbi->oc.numdevs = 0; comp.obj.partition = sbi->one_comp.obj.partition; comp.obj.id = EXOFS_DEVTABLE_ID; @@ -648,13 +648,13 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, goto out; if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->comps.ods[0]); + unsigned size = numdevs * sizeof(sbi->oc.ods[0]); /* Twice bigger table: See exofs_init_comps() and below * comment */ - sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); - if (unlikely(!sbi->comps.ods)) { + sbi->oc.ods = kzalloc(size + size - 1, GFP_KERNEL); + if (unlikely(!sbi->oc.ods)) { EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", numdevs); ret = -ENOMEM; @@ -681,8 +681,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->comps.ods[i] = fscb_od; - ++sbi->comps.numdevs; + sbi->oc.ods[i] = fscb_od; + ++sbi->oc.numdevs; fscb_od = NULL; continue; } @@ -695,8 +695,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, goto out; } - sbi->comps.ods[i] = od; - ++sbi->comps.numdevs; + sbi->oc.ods[i] = od; + ++sbi->oc.numdevs; /* Read the fscb of the other devices to make sure the FS * partition is there. @@ -719,7 +719,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, out: kfree(dt); if (likely(!ret)) { - unsigned numdevs = sbi->comps.numdevs; + unsigned numdevs = sbi->oc.numdevs; if (unlikely(fscb_od)) { EXOFS_ERR("ERROR: Bad device-table container device not present\n"); @@ -732,7 +732,7 @@ out: * starting at this device. 
See exofs_init_comps() */ for (i = 0; i < numdevs - 1; ++i) - sbi->comps.ods[i + numdevs] = sbi->comps.ods[i]; + sbi->oc.ods[i + numdevs] = sbi->oc.ods[i]; } return ret; } @@ -783,10 +783,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->one_comp.obj.partition = opts->pid; sbi->one_comp.obj.id = 0; exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); - sbi->comps.numdevs = 1; - sbi->comps.single_comp = EC_SINGLE_COMP; - sbi->comps.comps = &sbi->one_comp; - sbi->comps.ods = sbi->_min_one_dev; + sbi->oc.numdevs = 1; + sbi->oc.single_comp = EC_SINGLE_COMP; + sbi->oc.comps = &sbi->one_comp; + sbi->oc.ods = sbi->_min_one_dev; /* fill in some other data by hand */ memset(sb->s_id, 0, sizeof(sb->s_id)); @@ -835,7 +835,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (unlikely(ret)) goto free_sbi; } else { - sbi->comps.ods[0] = od; + sbi->oc.ods[0] = od; } __sbi_read_stats(sbi); @@ -875,7 +875,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], + _exofs_print_device("Mounting", opts->dev_name, sbi->oc.ods[0], sbi->one_comp.obj.partition); return 0; @@ -924,7 +924,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) uint64_t used = ULLONG_MAX; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (ret) { EXOFS_DBGMSG("ore_get_io_state failed.\n"); return ret; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index c5c5e00..954292a 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -64,7 +64,7 @@ struct ore_io_state { ore_io_done_fn done; struct ore_layout *layout; - struct ore_components *comps; + struct ore_components *oc; /* Global read/write IO*/ loff_t offset; -- cgit v0.10.2 From 8d2d83a8352b0f9c1da82c36f741722f2960feea Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 10 Aug 2011 14:15:02 -0700 Subject: exofs: Remove unused data_map member from exofs_sb_info The struct pnfs_osd_data_map data_map member of exofs_sb_info was never used after mount. In fact all its members were duplicated by the ore_layout structure. So just remove the duplicated information. Also removed some stupid, but perfectly supported, restrictions on layout parameters. The case where num_devices is not divisible by mirror_count+1 is perfectly fine since the rotating device view will eventually use all the devices it can get. Signed-off-by: Boaz Harrosh Signed-off-by: Benny Halevy diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index c09d5a7..3b2e047 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -66,9 +66,6 @@ struct exofs_sb_info { u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ - struct pnfs_osd_data_map data_map; /* Default raid to use * FIXME: Needed ?
- */ struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ struct ore_components oc; /* comps for the partition */ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index babb195..90b4c52 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -481,64 +481,51 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, { u64 stripe_length; - sbi->data_map.odm_num_comps = - le32_to_cpu(dt->dt_data_map.cb_num_comps); - sbi->data_map.odm_stripe_unit = + sbi->layout.stripe_unit = le64_to_cpu(dt->dt_data_map.cb_stripe_unit); - sbi->data_map.odm_group_width = + sbi->layout.group_width = le32_to_cpu(dt->dt_data_map.cb_group_width); - sbi->data_map.odm_group_depth = + sbi->layout.group_depth = le32_to_cpu(dt->dt_data_map.cb_group_depth); - sbi->data_map.odm_mirror_cnt = - le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); - sbi->data_map.odm_raid_algorithm = + sbi->layout.mirrors_p1 = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; + sbi->layout.raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); /* FIXME: Only raid0 for now. if not so, do not mount */ - if (sbi->data_map.odm_num_comps != numdevs) { - EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", - sbi->data_map.odm_num_comps, numdevs); - return -EINVAL; - } - if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { + if (sbi->layout.raid_algorithm != PNFS_OSD_RAID_0) { EXOFS_ERR("Only RAID_0 for now\n"); return -EINVAL; } - if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { - EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", - numdevs, sbi->data_map.odm_mirror_cnt); + if (numdevs < (sbi->layout.group_width * sbi->layout.mirrors_p1)) { + EXOFS_ERR("Data Map wrong, " + "numdevs=%d < group_width=%d * mirrors=%d\n", + numdevs, sbi->layout.group_width, + sbi->layout.mirrors_p1); return -EINVAL; } - if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { + if (0 != (sbi->layout.stripe_unit & ~PAGE_MASK)) { EXOFS_ERR("Stripe Unit(0x%llx)" " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); + _LLU(sbi->layout.stripe_unit), PAGE_SIZE); return -EINVAL; } - sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; - sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; - - if (sbi->data_map.odm_group_width) { - sbi->layout.group_width = sbi->data_map.odm_group_width; - sbi->layout.group_depth = sbi->data_map.odm_group_depth; + if (sbi->layout.group_width) { if (!sbi->layout.group_depth) { EXOFS_ERR("group_depth == 0 && group_width != 0\n"); return -EINVAL; } - sbi->layout.group_count = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1 / - sbi->data_map.odm_group_width; + sbi->layout.group_count = numdevs / sbi->layout.mirrors_p1 / + sbi->layout.group_width; } else { - if (sbi->data_map.odm_group_depth) { + if (sbi->layout.group_depth) { printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %d\n", - sbi->data_map.odm_group_depth); - sbi->data_map.odm_group_depth = 0; + "group_width == 0 && group_depth == %lld\n", + _LLU(sbi->layout.group_depth)); } - sbi->layout.group_width = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1; + sbi->layout.group_width = numdevs / sbi->layout.mirrors_p1; sbi->layout.group_depth = -1; sbi->layout.group_count = 1; } @@ -558,7 +545,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->layout.group_width, _LLU(sbi->layout.group_depth), sbi->layout.mirrors_p1, - sbi->data_map.odm_raid_algorithm); + 
sbi->layout.raid_algorithm); return 0; } diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 954292a..f7fabb4 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -34,6 +34,8 @@ struct ore_comp { struct ore_layout { /* Our way of looking at the data_map */ + enum pnfs_osd_raid_algorithm4 + raid_algorithm; unsigned stripe_unit; unsigned mirrors_p1; -- cgit v0.10.2 From eb507bc18969f63b8968034144fd69706c492516 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 10 Aug 2011 14:17:28 -0700 Subject: ore: Make ore_striping_info and ore_calc_stripe_info public The struct ore_striping_info will be used later in other structures, and ore_calc_stripe_info as well. Rename them and make struct ore_striping_info public. ore_calc_stripe_info is still static; it will be made public on first use. Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 870f85a..c2b0033 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -287,16 +287,8 @@ EXPORT_SYMBOL(ore_check_io); * * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit */ -struct _striping_info { - u64 obj_offset; - u64 group_length; - u64 M; /* for truncate */ - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct _striping_info *si) +static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si) { u32 stripe_unit = layout->stripe_unit; u32 group_width = layout->group_width; @@ -375,7 +367,7 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, } static int _prepare_one_group(struct ore_io_state *ios, u64 length, - struct _striping_info *si) + struct ore_striping_info *si) { unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; @@ -434,14 +426,14 @@ static int _prepare_for_striping(struct ore_io_state *ios) { u64 length = ios->length; u64 offset = ios->offset; - struct _striping_info si; + struct ore_striping_info si; int ret = 0; if (!ios->pages) { if (ios->kern_buff) { struct ore_per_dev_state *per_dev = &ios->per_dev[0]; - _calc_stripe_info(ios->layout, ios->offset, &si); + ore_calc_stripe_info(ios->layout, ios->offset, &si); per_dev->offset = si.obj_offset; per_dev->dev = si.dev; @@ -455,7 +447,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) } while (length) { - _calc_stripe_info(ios->layout, offset, &si); + ore_calc_stripe_info(ios->layout, offset, &si); if (length < si.group_length) si.group_length = length; @@ -744,7 +736,7 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, } struct _trunc_info { - struct _striping_info si; + struct ore_striping_info si; u64 prev_group_obj_off; u64 next_group_obj_off; @@ -758,7 +750,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, { unsigned stripe_unit = layout->stripe_unit; - _calc_stripe_info(layout, file_offset, &ti->si); + ore_calc_stripe_info(layout, file_offset, &ti->si); ti->prev_group_obj_off = ti->si.M * stripe_unit; ti->next_group_obj_off = ti->si.M ?
(ti->si.M - 1) * stripe_unit : 0; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index f7fabb4..e4d550f 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -56,6 +56,14 @@ struct ore_components { struct osd_dev **ods; /* osd_dev array */ }; +struct ore_striping_info { + u64 obj_offset; + u64 group_length; + u64 M; /* for truncate */ + unsigned dev; + unsigned unit_off; +}; + struct ore_io_state; typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); -- cgit v0.10.2 From d866d875f68fdeae63df334d291fe138dc636d96 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 14:43:09 +0300 Subject: ore/exofs: Change the type of the devices array (API change) In the pNFS obj-LD the device table at the layout level needs to point to a device_cache node, where it is possible and likely that many layouts will point to the same device-nodes. In Exofs we have a more orderly structure where we have a single array of devices that repeats twice for a round-robin view of the device table. This patch moves to a model that can be used by the pNFS obj-LD, where struct ore_components holds an array of ore_dev pointers (ore_dev is newly defined and contains a struct osd_dev *od member). Each pointer in the array will point to a bigger user-defined dev-struct, which can be accessed by use of the container_of macro. In Exofs an __alloc_dev_table() function allocates the ore_dev-pointers array as well as an exofs_dev array, in one allocation, and does the address dance to set everything pointing correctly. It still keeps the double-allocation trick for the inodes' round-robin view of the table. The device table is now always allocated dynamically, even for the single-device case, so it is unconditionally freed at umount.
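For illustration, a minimal sketch of the access pattern this model enables (struct exofs_dev is from this patch; the to_exofs_dev() helper name is made up for the example):

	/* Each oc->ods[i] points at the ore_dev embedded in a bigger,
	 * user-defined struct; container_of() recovers the outer struct.
	 */
	static inline struct exofs_dev *to_exofs_dev(struct ore_dev *ored)
	{
		return container_of(ored, struct exofs_dev, ored);
	}

	/* e.g. to_exofs_dev(sbi->oc.ods[i])->did is device i's table index */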
Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 3b2e047..006fd6f 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -53,6 +53,10 @@ /* u64 has problems with printk this will cast it to unsigned long long */ #define _LLU(x) (unsigned long long)(x) +struct exofs_dev { + struct ore_dev ored; + unsigned did; +}; /* * our extension to the in-memory superblock */ @@ -69,7 +73,6 @@ struct exofs_sb_info { struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ struct ore_components oc; /* comps for the partition */ - struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; /* @@ -214,13 +217,14 @@ static inline void exofs_init_comps(struct ore_components *oc, one_comp->obj.id = oid; exofs_make_credential(one_comp->cred, &one_comp->obj); - oc->numdevs = sbi->oc.numdevs; + oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * + sbi->layout.group_count; oc->single_comp = EC_SINGLE_COMP; oc->comps = one_comp; /* Round robin device view of the table */ first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; - oc->ods = sbi->oc.ods + first_dev; + oc->ods = &sbi->oc.ods[first_dev]; } #endif diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index c2b0033..a7d7925 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -59,7 +59,7 @@ static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { - return ios->oc->ods[index]; + return ore_comp_dev(ios->oc, index); } int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 90b4c52..bce3686 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -431,17 +431,18 @@ static void _exofs_print_device(const char *msg, const char *dev_path, static void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->oc.numdevs) { - int i = --sbi->oc.numdevs; - struct osd_dev *od = sbi->oc.ods[i]; + unsigned numdevs = sbi->oc.numdevs; + + while (numdevs) { + unsigned i = --numdevs; + struct osd_dev *od = ore_comp_dev(&sbi->oc, i); if (od) { - sbi->oc.ods[i] = NULL; + ore_comp_set_dev(&sbi->oc, i, NULL); osduld_put_device(od); } } - if (sbi->oc.ods != sbi->_min_one_dev) - kfree(sbi->oc.ods); + kfree(sbi->oc.ods); kfree(sbi); } @@ -468,7 +469,7 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->oc.ods[0], + _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); bdi_destroy(&sbi->bdi); @@ -592,12 +593,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } +int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_dev **peds) +{ + struct __alloc_ore_devs_and_exofs_devs { + /* Twice bigger table: See exofs_init_comps() and comment at + * exofs_read_lookup_dev_table() + */ + struct ore_dev *oreds[numdevs * 2 - 1]; + struct exofs_dev eds[numdevs]; + } *aoded; + struct exofs_dev *eds; + unsigned i; + + aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); + if (unlikely(!aoded)) { + EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", + numdevs); + return -ENOMEM; + } + + sbi->oc.ods = aoded->oreds; + *peds = eds = aoded->eds; + for (i = 0; i < numdevs; ++i) + aoded->oreds[i] = &eds[i].ored; + return 0; +} + static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, struct osd_dev *fscb_od, unsigned 
table_count) { struct ore_comp comp; struct exofs_device_table *dt; + struct exofs_dev *eds; unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + sizeof(*dt); unsigned numdevs, i; @@ -634,20 +663,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, if (unlikely(ret)) goto out; - if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->oc.ods[0]); - - /* Twice bigger table: See exofs_init_comps() and below - * comment - */ - sbi->oc.ods = kzalloc(size + size - 1, GFP_KERNEL); - if (unlikely(!sbi->oc.ods)) { - EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", - numdevs); - ret = -ENOMEM; - goto out; - } - } + ret = __alloc_dev_table(sbi, numdevs, &eds); + if (unlikely(ret)) + goto out; + /* exofs round-robins the device table view according to inode + * number. We hold a: twice bigger table hence inodes can point + * to any device and have a sequential view of the table + * starting at this device. See exofs_init_comps() + */ + memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], + (numdevs - 1) * sizeof(sbi->oc.ods[0])); for (i = 0; i < numdevs; i++) { struct exofs_fscb fscb; @@ -663,12 +688,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", i, odi.osdname); + /* the exofs id is currently the table index */ + eds[i].did = i; + /* On all devices the device table is identical. The user can * specify any one of the participating devices on the command * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->oc.ods[i] = fscb_od; + eds[i].ored.od = fscb_od; ++sbi->oc.numdevs; fscb_od = NULL; continue; @@ -682,7 +710,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, goto out; } - sbi->oc.ods[i] = od; + eds[i].ored.od = od; ++sbi->oc.numdevs; /* Read the fscb of the other devices to make sure the FS @@ -705,21 +733,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, out: kfree(dt); - if (likely(!ret)) { - unsigned numdevs = sbi->oc.numdevs; - - if (unlikely(fscb_od)) { + if (unlikely(fscb_od && !ret)) { EXOFS_ERR("ERROR: Bad device-table container device not present\n"); osduld_put_device(fscb_od); return -EINVAL; - } - /* exofs round-robins the device table view according to inode - * number. We hold a: twice bigger table hence inodes can point - * to any device and have a sequential view of the table - * starting at this device. 
See exofs_init_comps() - */ - for (i = 0; i < numdevs - 1; ++i) - sbi->oc.ods[i + numdevs] = sbi->oc.ods[i]; } return ret; } @@ -773,7 +790,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->oc.numdevs = 1; sbi->oc.single_comp = EC_SINGLE_COMP; sbi->oc.comps = &sbi->one_comp; - sbi->oc.ods = sbi->_min_one_dev; /* fill in some other data by hand */ memset(sb->s_id, 0, sizeof(sb->s_id)); @@ -822,7 +838,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (unlikely(ret)) goto free_sbi; } else { - sbi->oc.ods[0] = od; + struct exofs_dev *eds; + + ret = __alloc_dev_table(sbi, 1, &eds); + if (unlikely(ret)) + goto free_sbi; + + ore_comp_set_dev(&sbi->oc, 0, od); } __sbi_read_stats(sbi); @@ -862,7 +884,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->oc.ods[0], + _exofs_print_device("Mounting", opts->dev_name, + ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); return 0; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index e4d550f..8fefdfb 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -44,6 +44,10 @@ struct ore_layout { unsigned group_count; } +struct ore_dev { + struct osd_dev *od; +}; + struct ore_components { unsigned numdevs; /* Num of devices in array */ /* If @single_comp == EC_SINGLE_COMP, @comps points to a single * component. else there are @numdevs components */ enum EC_COMP_USAGE { EC_SINGLE_COMP = 0, EC_MULTPLE_COMPS = 0xffffffff } single_comp; struct ore_comp *comps; - struct osd_dev **ods; /* osd_dev array */ + + /* Array of pointers to ore_dev-* . User will usually have these pointed + * too a bigger struct which contain an "ore_dev ored" member and use + * container_of(oc->ods[i], struct foo_dev, ored) to access the bigger + * structure. + */ + struct ore_dev **ods; }; +/* ore_comp_dev Recievies a logical device index */ +static inline struct osd_dev *ore_comp_dev( + const struct ore_components *oc, unsigned i) +{ + BUG_ON(oc->numdevs <= i); + return oc->ods[i]->od; +} + +static inline void ore_comp_set_dev( + struct ore_components *oc, unsigned i, struct osd_dev *od) +{ + oc->ods[i]->od = od; +} + struct ore_striping_info { u64 obj_offset; u64 group_length; -- cgit v0.10.2 From b916c5cd4d895a27b47a652648958f73e4f23ac6 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 11:55:51 +0300 Subject: ore: Only IO one group at a time (API change) Usually a single IO is confined to one group of devices (group_width) and at the boundary of a raid group it can spill into a second group. Current code would allocate a full device_table-size array at each io_state so it can comply with requests that span two groups. Needless to say that is very wasteful, especially when the device_table count can get very large (hundreds, even thousands), while a group_width is usually 8 or 10. * Change the ore API to trim an IO that spans two raid groups. The user passes offset+length to ore_get_rw_state; the ore might trim that length if it spans a group boundary. The user must check ios->length or ios->nr_pages to see how much IO will be performed. It is the responsibility of the user to re-issue the remainder of the IO. * Modify exofs to copy spilled pages onto the next IO. This means one last kick is needed after all coalescing of pages is done.
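For illustration, the new calling convention in caller form (a sketch only; setup and error handling elided):

	/* ore_get_rw_state() may trim the request at a raid-group
	 * boundary; the trimmed size comes back in ios->length and
	 * ios->nr_pages, and the caller must re-issue the spilled tail,
	 * as exofs does in _maybe_not_all_in_one_io() below.
	 */
	ret = ore_get_rw_state(layout, oc, true, offset, length, &ios);
	if (!ret && ios->length < length) {
		/* pages past ios->nr_pages belong to the next raid
		 * group; hand them to a follow-up io_state
		 */
	}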
Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 61b2f7e..d87c1f7 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -259,6 +259,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) } } +static int _maybe_not_all_in_one_io(struct ore_io_state *ios, + struct page_collect *pcol_src, struct page_collect *pcol) +{ + /* length was wrong or offset was not page aligned */ + BUG_ON(pcol_src->nr_pages < ios->nr_pages); + + if (pcol_src->nr_pages > ios->nr_pages) { + struct page **src_page; + unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; + unsigned long len_less = pcol_src->length - ios->length; + unsigned i; + int ret; + + /* This IO was trimmed */ + pcol_src->nr_pages = ios->nr_pages; + pcol_src->length = ios->length; + + /* Left over pages are passed to the next io */ + pcol->expected_pages += pages_less; + pcol->nr_pages = pages_less; + pcol->length = len_less; + src_page = pcol_src->pages + pcol_src->nr_pages; + pcol->pg_first = (*src_page)->index; + + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + return ret; + + for (i = 0; i < pages_less; ++i) + pcol->pages[i] = *src_page++; + + EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " + "pages_less=0x%x expected_pages=0x%x " + "next_offset=0x%llx next_len=0x%lx\n", + pcol_src->nr_pages, pages_less, pcol->expected_pages, + pcol->pg_first * PAGE_SIZE, pcol->length); + } + return 0; +} + static int read_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); @@ -280,7 +320,6 @@ static int read_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol->pages; - ios->nr_pages = pcol->nr_pages; if (pcol->read_4_write) { ore_read(pcol->ios); @@ -296,17 +335,23 @@ static int read_exec(struct page_collect *pcol) *pcol_copy = *pcol; ios->done = readpages_done; ios->private = pcol_copy; + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + ret = ore_read(ios); if (unlikely(ret)) goto err; atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", - oi->one_comp.obj.id, _LLU(ios->offset), pcol->length); - - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -429,6 +474,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping, return ret; } + ret = read_exec(&pcol); + if (unlikely(ret)) + return ret; + return read_exec(&pcol); } @@ -519,7 +568,6 @@ static int write_exec(struct page_collect *pcol) ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); - if (unlikely(ret)) goto err; @@ -534,10 +582,19 @@ static int write_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol_copy->pages; - ios->nr_pages = pcol_copy->nr_pages; ios->done = writepages_done; ios->private = pcol_copy; + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + ret = ore_write(ios); if (unlikely(ret)) { EXOFS_ERR("write_exec: ore_write() Failed\n"); @@ -545,11 +602,6 @@ static int write_exec(struct page_collect *pcol) } 
atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", - pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), - pcol->length); - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -689,12 +741,30 @@ static int exofs_writepages(struct address_space *mapping, _pcol_init(&pcol, expected_pages, mapping->host); ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); - if (ret) { + if (unlikely(ret)) { EXOFS_ERR("write_cache_pages => %d\n", ret); return ret; } - return write_exec(&pcol); + ret = write_exec(&pcol); + if (unlikely(ret)) + return ret; + + if (wbc->sync_mode == WB_SYNC_ALL) { + return write_exec(&pcol); /* pump the last reminder */ + } else if (pcol.nr_pages) { + /* not SYNC let the reminder join the next writeout */ + unsigned i; + + for (i = 0; i < pcol.nr_pages; i++) { + struct page *page = pcol.pages[i]; + + end_page_writeback(page); + set_page_dirty(page); + unlock_page(page); + } + } + return 0; } static int exofs_writepage(struct page *page, struct writeback_control *wbc) diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index a7d7925..c1c2cc6 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -47,6 +47,9 @@ MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); MODULE_LICENSE("GPL"); +static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si); + static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) { return ios->oc->comps[index & ios->oc->single_comp].cred; @@ -62,38 +65,85 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) return ore_comp_dev(ios->oc, index); } -int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, - bool is_reading, u64 offset, u64 length, - struct ore_io_state **pios) +static int _get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + struct ore_io_state **pios) { struct ore_io_state *ios; /*TODO: Maybe use kmem_cach per sbi of size * exofs_io_state_size(layout->s_numdevs) */ - ios = kzalloc(ore_io_state_size(oc->numdevs), GFP_KERNEL); + ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL); if (unlikely(!ios)) { ORE_DBGMSG("Failed kzalloc bytes=%d\n", - ore_io_state_size(oc->numdevs)); + ore_io_state_size(numdevs)); *pios = NULL; return -ENOMEM; } ios->layout = layout; ios->oc = oc; - ios->offset = offset; - ios->length = length; + *pios = ios; + return 0; +} + +/* Allocate an io_state for only a single group of devices + * + * If a user needs to call ore_read/write() this version must be used becase it + * allocates extra stuff for striping and raid. + * The ore might decide to only IO less then @length bytes do to alignmets + * and constrains as follows: + * - The IO cannot cross group boundary. + * - In raid5/6 The end of the IO must align at end of a stripe eg. + * (@offset + @length) % strip_size == 0. Or the complete range is within a + * single stripe. + * - Memory condition only permitted a shorter IO. (A user can use @length=~0 + * And check the returned ios->length for max_io_size.) 
+ * + * The caller must check returned ios->length (and/or ios->nr_pages) and + * re-issue these pages that fall outside of ios->length + */ +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, + bool is_reading, u64 offset, u64 length, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; + unsigned numdevs = layout->group_width * layout->mirrors_p1; + int ret; + + ret = _get_io_state(layout, oc, numdevs, pios); + if (unlikely(ret)) + return ret; + + ios = *pios; ios->reading = is_reading; + ios->offset = offset; + + if (length) { + struct ore_striping_info si; + + ore_calc_stripe_info(layout, offset, &si); + ios->length = (length <= si.group_length) ? length : + si.group_length; + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + } - *pios = ios; return 0; } EXPORT_SYMBOL(ore_get_rw_state); +/* Allocate an io_state for all the devices in the comps array + * + * This version of io_state allocation is used mostly by create/remove + * and trunc where we currently need all the devices. The only wastful + * bit is the read/write_attributes with no IO. Those sites should + * be converted to use ore_get_rw_state() with length=0 + */ int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, - struct ore_io_state **ios) + struct ore_io_state **pios) { - return ore_get_rw_state(layout, oc, true, 0, 0, ios); + return _get_io_state(layout, oc, oc->numdevs, pios); } EXPORT_SYMBOL(ore_get_io_state); @@ -374,12 +424,12 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, unsigned devs_in_group = ios->layout->group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; unsigned cur_pg = ios->pages_consumed; int ret = 0; while (length) { - struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; + unsigned comp = dev - first_dev; + struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; unsigned cur_len, page_off = 0; if (!per_dev->length) { @@ -397,9 +447,6 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, per_dev->offset = si->obj_offset - si->unit_off; cur_len = stripe_unit; } - - if (max_comp < dev) - max_comp = dev; } else { cur_len = stripe_unit; } @@ -417,17 +464,15 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, length -= cur_len; } out: - ios->numdevs = max_comp + mirrors_p1; + ios->numdevs = devs_in_group; ios->pages_consumed = cur_pg; return ret; } static int _prepare_for_striping(struct ore_io_state *ios) { - u64 length = ios->length; - u64 offset = ios->offset; struct ore_striping_info si; - int ret = 0; + int ret; if (!ios->pages) { if (ios->kern_buff) { @@ -446,21 +491,11 @@ static int _prepare_for_striping(struct ore_io_state *ios) return 0; } - while (length) { - ore_calc_stripe_info(ios->layout, offset, &si); - - if (length < si.group_length) - si.group_length = length; + ore_calc_stripe_info(ios->layout, ios->offset, &si); - ret = _prepare_one_group(ios, si.group_length, &si); - if (unlikely(ret)) - goto out; + BUG_ON(ios->length > si.group_length); + ret = _prepare_one_group(ios, ios->length, &si); - offset += si.group_length; - length -= si.group_length; - } - -out: return ret; } @@ -742,7 +777,6 @@ struct _trunc_info { unsigned first_group_dev; unsigned nex_group_dev; - unsigned max_devs; }; static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, @@ -757,7 +791,6 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, 
ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); ti->nex_group_dev = ti->first_group_dev + layout->group_width; - ti->max_devs = layout->group_width * layout->group_count; } int ore_truncate(struct ore_layout *layout, struct ore_components *oc, @@ -777,7 +810,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, _calc_trunk_info(ios->layout, size, &ti); - size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), + size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), GFP_KERNEL); if (unlikely(!size_attrs)) { ret = -ENOMEM; @@ -786,7 +819,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, ios->numdevs = ios->oc->numdevs; - for (i = 0; i < ti.max_devs; ++i) { + for (i = 0; i < ios->numdevs; ++i) { struct exofs_trunc_attr *size_attr = &size_attrs[i]; u64 obj_size; -- cgit v0.10.2 From 98260754046eee4cc7d75751a4a20182ade39f58 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 2 Oct 2011 15:32:50 +0200 Subject: ore: cleanup: Embed an ore_striping_info inside ore_io_state Now that each ore_io_state covers only a single raid group, only a single striping_info calculation is needed. Embed one inside ore_io_state to cache the calculation results and eliminate an extra call. Also, the outer _prepare_for_striping is removed since it does nothing. Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index c1c2cc6..7e02d33 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -121,11 +121,9 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, ios->offset = offset; if (length) { - struct ore_striping_info si; - - ore_calc_stripe_info(layout, offset, &si); - ios->length = (length <= si.group_length) ? length : - si.group_length; + ore_calc_stripe_info(layout, offset, &ios->si); + ios->length = (length <= ios->si.group_length) ?
length : + ios->si.group_length; ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; } @@ -416,17 +414,36 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, return 0; } -static int _prepare_one_group(struct ore_io_state *ios, u64 length, - struct ore_striping_info *si) +static int _prepare_for_striping(struct ore_io_state *ios) { + struct ore_striping_info *si = &ios->si; unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; unsigned devs_in_group = ios->layout->group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); unsigned cur_pg = ios->pages_consumed; + u64 length = ios->length; int ret = 0; + if (!ios->pages) { + if (ios->kern_buff) { + struct ore_per_dev_state *per_dev = &ios->per_dev[0]; + + per_dev->offset = si->obj_offset; + per_dev->dev = si->dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (si->unit_off + ios->length > + ios->layout->stripe_unit)); + } + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + BUG_ON(length > si->group_length); + while (length) { unsigned comp = dev - first_dev; struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; @@ -469,36 +486,6 @@ out: return ret; } -static int _prepare_for_striping(struct ore_io_state *ios) -{ - struct ore_striping_info si; - int ret; - - if (!ios->pages) { - if (ios->kern_buff) { - struct ore_per_dev_state *per_dev = &ios->per_dev[0]; - - ore_calc_stripe_info(ios->layout, ios->offset, &si); - per_dev->offset = si.obj_offset; - per_dev->dev = si.dev; - - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (si.unit_off + ios->length > - ios->layout->stripe_unit)); - } - ios->numdevs = ios->layout->mirrors_p1; - return 0; - } - - ore_calc_stripe_info(ios->layout, ios->offset, &si); - - BUG_ON(ios->length > si.group_length); - ret = _prepare_one_group(ios, ios->length, &si); - - return ret; -} - int ore_create(struct ore_io_state *ios) { int i, ret; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 8fefdfb..baeef02 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -93,6 +93,7 @@ typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); struct ore_io_state { struct kref kref; + struct ore_striping_info si; void *private; ore_io_done_fn done; -- cgit v0.10.2 From 6851a5e5c12b35f8bb8986d0ff16ca9be4cf2c09 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 24 Aug 2011 18:10:49 -0700 Subject: ore: Remove check for ios->kern_buff in _prepare_for_striping to later Move the check and preparation of the ios->kern_buff case to later inside _write_mirror(). Since read was never used with ios->kern_buff its support is removed instead of fixed. 
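For illustration, the invariant that moves into _write_mirror() can be stated as a predicate (hypothetical helper, not part of the patch):

	/* A kern_buff write has no page array to split across devices,
	 * so when striping is in effect it must fit within one stripe
	 * unit; _write_mirror() BUG()s if this does not hold.
	 */
	static bool kern_buff_fits_one_unit(const struct ore_layout *layout,
					    u64 unit_off, u64 length)
	{
		if (layout->group_width <= 1)
			return true;	/* single stripe, no crossing possible */
		return unit_off + length <= layout->stripe_unit;
	}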
Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 7e02d33..d0f8db5 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -427,17 +427,6 @@ static int _prepare_for_striping(struct ore_io_state *ios) int ret = 0; if (!ios->pages) { - if (ios->kern_buff) { - struct ore_per_dev_state *per_dev = &ios->per_dev[0]; - - per_dev->offset = si->obj_offset; - per_dev->dev = si->dev; - - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (si->unit_off + ios->length > - ios->layout->stripe_unit)); - } ios->numdevs = ios->layout->mirrors_p1; return 0; } @@ -557,7 +546,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } per_dev->or = or; - per_dev->offset = master_dev->offset; if (ios->pages) { struct bio *bio; @@ -576,6 +564,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; + per_dev->offset = master_dev->offset; per_dev->length = master_dev->length; per_dev->bio = bio; per_dev->dev = dev; @@ -593,7 +582,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) _LLU(per_dev->offset), _LLU(per_dev->length), dev); } else if (ios->kern_buff) { - ret = osd_req_write_kern(or, _ios_obj(ios, dev), + per_dev->offset = ios->si.obj_offset; + per_dev->dev = ios->si.dev + dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (ios->si.unit_off + ios->length > + ios->layout->stripe_unit)); + + ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), per_dev->offset, ios->kern_buff, ios->length); if (unlikely(ret)) @@ -602,7 +599,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) "length=0x%llx dev=%d\n", _LLU(_ios_obj(ios, dev)->id), _LLU(per_dev->offset), - _LLU(ios->length), dev); + _LLU(ios->length), per_dev->dev); } else { osd_req_set_attributes(or, _ios_obj(ios, dev)); ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", @@ -668,16 +665,9 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) " dev=%d\n", _LLU(obj->id), _LLU(per_dev->offset), _LLU(per_dev->length), first_dev); - } else if (ios->kern_buff) { - int ret = osd_req_read_kern(or, obj, per_dev->offset, - ios->kern_buff, ios->length); - ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d ret=>%d\n", - _LLU(obj->id), _LLU(per_dev->offset), - _LLU(ios->length), first_dev, ret); - if (unlikely(ret)) - return ret; } else { + BUG_ON(ios->kern_buff); + osd_req_get_attributes(or, obj); ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", _LLU(obj->id), -- cgit v0.10.2 From 154a9300cd87eee7538f356c9d339f06b0b1d257 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 26 Aug 2011 21:00:32 -0700 Subject: exofs: Support for short read/writes If at read/write_done the actual IO was shorter than requested, it is reported in the returned ios->length. It is not an error. The remainder of the pages should just be unlocked, but not marked uptodate or end_page_writeback; they will be re-issued later by the VFS.
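On the write side this is the same give-back pattern exofs_writepages() uses for a non-sync leftover earlier in this series (shown here for illustration):

	/* pages that were never submitted are not failed: end
	 * writeback, re-dirty and unlock them so a later writeout
	 * pass picks them up again
	 */
	for (i = 0; i < pcol.nr_pages; i++) {
		struct page *page = pcol.pages[i];

		end_page_writeback(page);
		set_page_dirty(page);
		unlock_page(page);
	}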
Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d87c1f7..96366a1 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -149,14 +149,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page, return 0; } +enum {PAGE_WAS_NOT_IN_IO = 17}; static int update_read_page(struct page *page, int ret) { - if (ret == 0) { + switch (ret) { + case 0: /* Everything is OK */ SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - } else if (ret == -EFAULT) { + break; + case -EFAULT: /* In this case we were trying to read something that wasn't on * disk yet - return a page full of zeroes. This should be OK, * because the object should be empty (if there was a write @@ -167,16 +170,22 @@ static int update_read_page(struct page *page, int ret) SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - ret = 0; /* recovered error */ EXOFS_DBGMSG("recovered read error\n"); - } else /* Error */ + /* fall through */ + case PAGE_WAS_NOT_IN_IO: + ret = 0; /* recovered error */ + break; + default: SetPageError(page); - + } return ret; } static void update_write_page(struct page *page, int ret) { + if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) + return; /* don't pass start don't collect $200 */ + if (ret) { mapping_set_error(page->mapping, ret); SetPageError(page); @@ -195,10 +204,14 @@ static int __readpages_done(struct page_collect *pcol) u64 length = 0; int ret = ore_check_io(pcol->ios, &resid); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else + ret = PAGE_WAS_NOT_IN_IO; + } else { good_bytes = pcol->length - resid; + } + if (good_bytes > pcol->ios->length) + good_bytes = pcol->ios->length; EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -518,10 +531,14 @@ static void writepages_done(struct ore_io_state *ios, void *p) atomic_dec(&pcol->sbi->s_curr_pending); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else + ret = PAGE_WAS_NOT_IN_IO; + } else { good_bytes = pcol->length - resid; + } + if (good_bytes > pcol->ios->length) + good_bytes = pcol->ios->length; EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", -- cgit v0.10.2 From bbf9a31bba8c985780fe94da059cc5813a7920f5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 26 Aug 2011 21:04:52 -0700 Subject: ore: Support for short read/writes Memory conditions and max_bio constraints might cause us not to comply with the full length of the requested IO. Instead of failing the complete IO we can issue a shorter read/write and report how much was actually executed in the ios->length member. All users must check ios->length at IO-done or upon return of ore_read/write and re-issue the remainder of the bytes, because otherwise no error is returned like before. This is part of the effort to support the pnfs-obj layout driver.
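For illustration, the caller-side contract as a sketch (assumes the pages for each segment are attached between the two calls; error handling elided):

	/* ore_read/ore_write may execute less than requested and report
	 * the executed byte count in ios->length; loop until done.
	 */
	while (length) {
		ret = ore_get_rw_state(layout, oc, false, offset, length, &ios);
		if (unlikely(ret))
			break;
		/* ... attach ios->pages for this segment here ... */
		ret = ore_write(ios);	/* may be short */
		if (unlikely(ret))
			break;
		offset += ios->length;	/* bytes actually executed */
		length -= ios->length;
		ore_put_io_state(ios);
	}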
Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index d0f8db5..8354fe0 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -377,8 +377,8 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pg = *cur_pg; struct request_queue *q = osd_request_queue(_ios_od(ios, per_dev->dev)); - - per_dev->length += cur_len; + unsigned len = cur_len; + int ret; if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * @@ -390,7 +390,8 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, if (unlikely(!per_dev->bio)) { ORE_DBGMSG("Failed to allocate BIO size=%u\n", bio_size); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } @@ -403,15 +404,24 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; + if (unlikely(pglen != added_len)) { + ret = -ENOMEM; + goto out; + } pgbase = 0; ++pg; } BUG_ON(cur_len); + per_dev->length += len; *cur_pg = pg; - return 0; + ret = 0; +out: /* we fail the complete unit on an error eg don't advance + * per_dev->length and cur_pg. This means that we might have a bigger + * bio than the CDB requested length (per_dev->length). That's fine + * only the oposite is fatal. + */ + return ret; } static int _prepare_for_striping(struct ore_io_state *ios) @@ -472,7 +482,13 @@ static int _prepare_for_striping(struct ore_io_state *ios) out: ios->numdevs = devs_in_group; ios->pages_consumed = cur_pg; - return ret; + if (unlikely(ret)) { + if (length == ios->length) + return ret; + else + ios->length -= length; + } + return 0; } int ore_create(struct ore_io_state *ios) -- cgit v0.10.2 From 3bd9856857339d7ee8c4ad50030583f1b9415c39 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 12:04:23 +0300 Subject: ore: Support for partial component table Users like the objlayout-driver would like to pass only a partial device table that covers the IO in question. For example, exofs divides the file into raid-group-sized chunks and only serves group_width number of devices at a time. The partiality is communicated by setting ore_components->first_dev, and the array covers all logical devices from oc->first_dev up to (oc->first_dev + oc->numdevs). The ore_comp_dev() API receives a logical device index and returns the actual present device in the table. An out-of-range dev_index will BUG. The logical device index is the theoretical device index as if all the devices of a file are present.
.i.e: total_devs = group_width * mirror_p1 * group_count 0 <= dev_index < total_devs Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 006fd6f..51f4b4c 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -217,6 +217,7 @@ static inline void exofs_init_comps(struct ore_components *oc, one_comp->obj.id = oid; exofs_make_credential(one_comp->cred, &one_comp->obj); + oc->first_dev = 0; oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * sbi->layout.group_count; oc->single_comp = EC_SINGLE_COMP; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 8354fe0..f1b7180 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -62,6 +62,10 @@ static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { + ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", + ios->oc->first_dev, ios->oc->numdevs, index, + ios->oc->ods); + return ore_comp_dev(ios->oc, index); } diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index baeef02..492b70d 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -49,6 +49,7 @@ struct ore_dev { }; struct ore_components { + unsigned first_dev; /* First logical device no */ unsigned numdevs; /* Num of devices in array */ /* If @single_comp == EC_SINGLE_COMP, @comps points to a single * component. else there are @numdevs components @@ -70,14 +71,14 @@ struct ore_components { static inline struct osd_dev *ore_comp_dev( const struct ore_components *oc, unsigned i) { - BUG_ON(oc->numdevs <= i); - return oc->ods[i]->od; + BUG_ON((i < oc->first_dev) || (oc->first_dev + oc->numdevs <= i)); + return oc->ods[i - oc->first_dev]->od; } static inline void ore_comp_set_dev( struct ore_components *oc, unsigned i, struct osd_dev *od) { - oc->ods[i]->od = od; + oc->ods[i - oc->first_dev]->od = od; } struct ore_striping_info { -- cgit v0.10.2 From 5a51c0c7e9a913649aa65d8233470682bcbb7694 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 13:18:45 +0300 Subject: ore/exofs: Define new ore_verify_layout All users of the ore will need to check if current code supports the given layout. For example RAID5/6 is not currently supported. So move all the checks from exofs/super.c to a new ore_verify_layout() to be used by ore users. Note that any new layout should be passed through the ore_verify_layout() because the ore engine will prepare and verify some internal members of ore_layout, and assumes it's called. Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 96366a1..5a62420 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -37,11 +37,7 @@ #define EXOFS_DBGMSG2(M...) 
do {} while (0) -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), - MAX_PAGES_KMALLOC = - PAGE_SIZE / sizeof(struct page *), -}; +enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned expected_pages) @@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); /* TODO: easily support bio chaining */ - pages = min_t(unsigned, pages, - layout->group_width * BIO_MAX_PAGES_KMALLOC); + pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE); return pages; } diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index f1b7180..4ca59d4 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -47,9 +47,76 @@ MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); MODULE_LICENSE("GPL"); +/* ore_verify_layout does a couple of things: + * 1. Given a minimum number of needed parameters fixes up the rest of the + * members to be operatonals for the ore. The needed parameters are those + * that are defined by the pnfs-objects layout STD. + * 2. Check to see if the current ore code actually supports these parameters + * for example stripe_unit must be a multple of the system PAGE_SIZE, + * and etc... + * 3. Cache some havily used calculations that will be needed by users. + */ + static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, struct ore_striping_info *si); +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; + +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) +{ + u64 stripe_length; + +/* FIXME: Only raid0 is supported for now. */ + if (layout->raid_algorithm != PNFS_OSD_RAID_0) { + ORE_ERR("Only RAID_0 for now\n"); + return -EINVAL; + } + if (0 != (layout->stripe_unit & ~PAGE_MASK)) { + ORE_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(layout->stripe_unit), PAGE_SIZE); + return -EINVAL; + } + if (layout->group_width) { + if (!layout->group_depth) { + ORE_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + if (total_comps < (layout->group_width * layout->mirrors_p1)) { + ORE_ERR("Data Map wrong, " + "numdevs=%d < group_width=%d * mirrors=%d\n", + total_comps, layout->group_width, + layout->mirrors_p1); + return -EINVAL; + } + layout->group_count = total_comps / layout->mirrors_p1 / + layout->group_width; + } else { + if (layout->group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %lld\n", + _LLU(layout->group_depth)); + } + layout->group_width = total_comps / layout->mirrors_p1; + layout->group_depth = -1; + layout->group_count = 1; + } + + stripe_length = (u64)layout->group_width * layout->stripe_unit; + if (stripe_length >= (1ULL << 32)) { + ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", + _LLU(stripe_length)); + return -EINVAL; + } + + layout->max_io_length = + (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * + layout->group_width; + return 0; +} +EXPORT_SYMBOL(ore_verify_layout); + static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) { return ios->oc->comps[index & ios->oc->single_comp].cred; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index bce3686..057b237 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -480,7 +480,7 @@ static void exofs_put_super(struct super_block *sb) static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned 
numdevs, struct exofs_device_table *dt) { - u64 stripe_length; + int ret; sbi->layout.stripe_unit = le64_to_cpu(dt->dt_data_map.cb_stripe_unit); @@ -493,50 +493,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->layout.raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Only raid0 for now. if not so, do not mount */ - if (sbi->layout.raid_algorithm != PNFS_OSD_RAID_0) { - EXOFS_ERR("Only RAID_0 for now\n"); - return -EINVAL; - } - if (numdevs < (sbi->layout.group_width * sbi->layout.mirrors_p1)) { - EXOFS_ERR("Data Map wrong, " - "numdevs=%d < group_width=%d * mirrors=%d\n", - numdevs, sbi->layout.group_width, - sbi->layout.mirrors_p1); - return -EINVAL; - } - - if (0 != (sbi->layout.stripe_unit & ~PAGE_MASK)) { - EXOFS_ERR("Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(sbi->layout.stripe_unit), PAGE_SIZE); - return -EINVAL; - } - - if (sbi->layout.group_width) { - if (!sbi->layout.group_depth) { - EXOFS_ERR("group_depth == 0 && group_width != 0\n"); - return -EINVAL; - } - sbi->layout.group_count = numdevs / sbi->layout.mirrors_p1 / - sbi->layout.group_width; - } else { - if (sbi->layout.group_depth) { - printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %lld\n", - _LLU(sbi->layout.group_depth)); - } - sbi->layout.group_width = numdevs / sbi->layout.mirrors_p1; - sbi->layout.group_depth = -1; - sbi->layout.group_count = 1; - } - - stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; - if (stripe_length >= (1ULL << 32)) { - EXOFS_ERR("Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -EINVAL; - } + ret = ore_verify_layout(numdevs, &sbi->layout); EXOFS_DBGMSG("exofs: layout: " "num_comps=%u stripe_unit=0x%x group_width=%u " @@ -547,7 +504,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, _LLU(sbi->layout.group_depth), sbi->layout.mirrors_p1, sbi->layout.raid_algorithm); - return 0; + return ret; } static unsigned __ra_pages(struct ore_layout *layout) diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 492b70d..716dbea 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -42,6 +42,13 @@ struct ore_layout { unsigned group_width; u64 group_depth; unsigned group_count; + + /* Cached often needed calculations filled in by + * ore_verify_layout + */ + unsigned long max_io_length; /* Max length that should be passed to + * ore_get_rw_state + */ }; struct ore_dev { @@ -138,6 +145,7 @@ static inline unsigned ore_io_state_size(unsigned numdevs) } /* ore.c */ +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, bool is_reading, u64 offset, u64 length, struct ore_io_state **ios); -- cgit v0.10.2 From 4b46c9f5cf69505f0bc708995b88b0cc60317ffd Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 13:25:50 +0300 Subject: ore/exofs: Change ore_check_io API The current ore_check_io API receives a residual pointer with which to report partial IO, but it is actually not used, because in a multiple-device IO there is never any linearity in the IO failure. On the other hand, if every failing device is reported through a supplied callback, measures can be taken to handle only the failed devices, one at a time. This will also be needed by the objects-layout-driver for its error-reporting facility. (A hypothetical example callback follows.)
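As a hedged sketch of the new API (my_on_dev_error is a hypothetical consumer; the ore_on_dev_error typedef appears in the osd_ore.h hunk below):

	/* Hypothetical consumer of the callback-based API: it is invoked
	 * once per failing device with that device's offset/length, so only
	 * the failed device needs handling.
	 */
	static void my_on_dev_error(struct ore_io_state *ios, struct ore_dev *od,
				    unsigned dev_index, enum osd_err_priority oep,
				    u64 dev_offset, u64 dev_len)
	{
		pr_err("ore: dev=%u failed offset=0x%llx len=0x%llx prio=%d\n",
		       dev_index, (unsigned long long)dev_offset,
		       (unsigned long long)dev_len, oep);
	}

	...
	ret = ore_check_io(ios, my_on_dev_error);
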
Exofs is not currently using the new information and keeps the old behaviour of failing the complete IO in case of an error. (No partial completion) TODO: Use an ore_check_io callback to set_page_error only the failing pages. And re-dirty write pages. Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 5a62420..86c0ac8 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -194,19 +194,16 @@ static void update_write_page(struct page *page, int ret) static int __readpages_done(struct page_collect *pcol) { int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = ore_check_io(pcol->ios, &resid); + int ret = ore_check_io(pcol->ios, NULL); if (likely(!ret)) { good_bytes = pcol->length; ret = PAGE_WAS_NOT_IN_IO; } else { - good_bytes = pcol->length - resid; + good_bytes = 0; } - if (good_bytes > pcol->ios->length) - good_bytes = pcol->ios->length; EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -519,10 +516,9 @@ static void writepages_done(struct ore_io_state *ios, void *p) { struct page_collect *pcol = p; int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = ore_check_io(ios, &resid); + int ret = ore_check_io(ios, NULL); atomic_dec(&pcol->sbi->s_curr_pending); @@ -530,10 +526,8 @@ static void writepages_done(struct ore_io_state *ios, void *p) good_bytes = pcol->length; ret = PAGE_WAS_NOT_IN_IO; } else { - good_bytes = pcol->length - resid; + good_bytes = 0; } - if (good_bytes > pcol->ios->length) - good_bytes = pcol->ios->length; EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 4ca59d4..3b1cc3a 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -317,7 +317,7 @@ static void _clear_bio(struct bio *bio) } } -int ore_check_io(struct ore_io_state *ios, u64 *resid) +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) { enum osd_err_priority acumulated_osd_err = 0; int acumulated_lin_err = 0; @@ -325,7 +325,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) for (i = 0; i < ios->numdevs; i++) { struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; + struct osd_request *or = per_dev->or; int ret; if (unlikely(!or)) @@ -337,29 +338,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { /* start read offset passed endof file */ - _clear_bio(ios->per_dev[i].bio); + _clear_bio(per_dev->bio); ORE_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", - _LLU(ios->per_dev[i].offset), - _LLU(ios->per_dev[i].length)); + _LLU(per_dev->offset), + _LLU(per_dev->length)); continue; /* we recovered */ } + if (on_dev_error) { + u64 residual = ios->reading ? 
+ or->in.residual : or->out.residual; + u64 offset = (ios->offset + ios->length) - residual; + struct ore_dev *od = ios->oc->ods[ + per_dev->dev - ios->oc->first_dev]; + + on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, + offset, residual); + } if (osi.osd_err_pri >= acumulated_osd_err) { acumulated_osd_err = osi.osd_err_pri; acumulated_lin_err = ret; } } - /* TODO: raid specific residual calculations */ - if (resid) { - if (likely(!acumulated_lin_err)) - *resid = 0; - else - *resid = ios->length; - } - return acumulated_lin_err; } EXPORT_SYMBOL(ore_check_io); diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 716dbea..af2231a 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -153,7 +153,10 @@ int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, struct ore_io_state **ios); void ore_put_io_state(struct ore_io_state *ios); -int ore_check_io(struct ore_io_state *ios, u64 *resid); +typedef void (*ore_on_dev_error)(struct ore_io_state *ios, struct ore_dev *od, + unsigned dev_index, enum osd_err_priority oep, + u64 dev_offset, u64 dev_len); +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error rep); int ore_create(struct ore_io_state *ios); int ore_remove(struct ore_io_state *ios); -- cgit v0.10.2 From 611d7a5dc6f2a1a0cfd8cc07b9d15f794cbe5f98 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 4 Oct 2011 14:20:17 +0200 Subject: ore: Make ore_calc_stripe_info EXPORT_SYMBOL ore_calc_stripe_info is needed by exofs::export.c for the layout calculations. Make it exportable Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 3b1cc3a..d92998d 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -57,9 +57,6 @@ MODULE_LICENSE("GPL"); * 3. Cache some havily used calculations that will be needed by users. 
*/ -static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si); - enum { BIO_MAX_PAGES_KMALLOC = (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; @@ -409,8 +406,8 @@ EXPORT_SYMBOL(ore_check_io); * * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit */ -static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si) +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si) { u32 stripe_unit = layout->stripe_unit; u32 group_width = layout->group_width; @@ -443,6 +440,7 @@ static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, si->group_length = T - H; si->M = M; } +EXPORT_SYMBOL(ore_calc_stripe_info); static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pgbase, struct ore_per_dev_state *per_dev, diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index af2231a..a8e39d1 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -146,6 +146,9 @@ static inline unsigned ore_io_state_size(unsigned numdevs) /* ore.c */ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si); + int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, bool is_reading, u64 offset, u64 length, struct ore_io_state **ios); -- cgit v0.10.2 From 3e335672e018c06e007f85a5d54afd721fb3d6d5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 24 Oct 2011 16:36:33 -0700 Subject: fs/Makefile: Always inspect exofs/ The fs/exofs directory has multiple targets now, of which ore.ko will be needed by the pnfs-objects-layout-driver (fs/nfs/objlayout). As suggested by Michal Marek, convert the inclusion of exofs/ from obj-$(CONFIG_EXOFS_FS) => obj-$(y), so that ORE can also be selected from fs/nfs/Kconfig. CC: Michal Marek CC: Al Viro Signed-off-by: Boaz Harrosh diff --git a/fs/Makefile b/fs/Makefile index afc1096..5c30a13 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ -obj-$(CONFIG_EXOFS_FS) += exofs/ +obj-$(y) += exofs/ # Multiple mods, used by nfs/objlayout obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ -- cgit v0.10.2 From a1fec1dbbc8db974d2582e4040590cebe72171e4 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 12 Oct 2011 18:42:22 +0200 Subject: ore: RAID5 read This patch introduces the first stage of RAID5 support, mainly the skip-over-raid-units logic when reading. For writes it inserts BLANK units where XOR blocks should be calculated and written to. It introduces the new "general raid maths", and the main additional parameters and components needed for raid5. Since at this stage it could corrupt future versions that actually do support raid5, the enablement of raid5 mounting and the setting of parity-count > 0 are disabled, so the raid5 code will never be used. Mounting of raid5 is only enabled later, once the basic XOR write is also in. But if the patch "enable RAID5" is applied, this code has been tested to properly read raid5 volumes according to the standard. It has also been tested that the new maths still properly supports the RAID0 and grouping code just as before. (A small stand-alone sketch of the rotation math follows.)
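To make the new maths concrete, here is a hedged stand-alone sketch (the helper name is invented; it mirrors the I = (group_width + group_width - R*parity - parity) % group_width formula documented in the ore.c hunk below, and assumes parity > 0):

	#include <linux/lcm.h>

	/* Sketch: the first parity device of stripe N within one group,
	 * before adding first_dev and scaling by mirrors_p1.
	 */
	static unsigned par_dev_of_stripe(unsigned group_width,
					  unsigned parity, u32 N)
	{
		unsigned LCMdP = lcm(group_width, parity) / parity;
		unsigned RxP = (N % LCMdP) * parity;	/* R * parity */

		return (group_width + group_width - parity - RxP) %
								group_width;
	}
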
(BTW: I have found more bugs in the pnfs-obj RAID math fixed here) The ore.c file is getting too big, so new ore_raid.[hc] files are added that will include the special raid stuff that are not used in striping and mirrors. In future write support these will get bigger. When adding the ore_raid.c to Kbuild file I was forced to rename ore.ko to libore.ko. Is it possible to keep source file, say ore.c and module file ore.ko the same even if there are multiple files inside ore.ko? Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index c5a5855..352ba14 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -13,7 +13,8 @@ # # ore module library -obj-$(CONFIG_ORE) += ore.o +libore-y := ore.o ore_raid.o +obj-$(CONFIG_ORE) += libore.o exofs-y := inode.o file.o symlink.o namei.o dir.o super.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index d92998d..fd6090d 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -24,24 +24,9 @@ #include #include +#include -#include - -#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) - -#ifdef CONFIG_EXOFS_DEBUG -#define ORE_DBGMSG(fmt, a...) \ - printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) -#else -#define ORE_DBGMSG(fmt, a...) \ - do { if (0) printk(fmt, ##a); } while (0) -#endif - -/* u64 has problems with printk this will cast it to unsigned long long */ -#define _LLU(x) (unsigned long long)(x) - -#define ORE_DBGMSG2(M...) do {} while (0) -/* #define ORE_DBGMSG2 ORE_DBGMSG */ +#include "ore_raid.h" MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); @@ -133,21 +118,81 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) return ore_comp_dev(ios->oc, index); } -static int _get_io_state(struct ore_layout *layout, - struct ore_components *oc, unsigned numdevs, - struct ore_io_state **pios) +static int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios) { struct ore_io_state *ios; + struct page **pages; + struct osd_sg_entry *sgilist; + struct __alloc_all_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + union { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + }; + } *_aios; + + if (likely(sizeof(*_aios) <= PAGE_SIZE)) { + _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); + if (unlikely(!_aios)) { + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", + sizeof(*_aios)); + *pios = NULL; + return -ENOMEM; + } + pages = num_par_pages ? _aios->pages : NULL; + sgilist = sgs_per_dev ? 
_aios->sglist : NULL; + ios = &_aios->ios; + } else { + struct __alloc_small_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + } *_aio_small; + union __extra_part { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + } *extra_part; + + _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); + if (unlikely(!_aio_small)) { + ORE_DBGMSG("Failed alloc first part bytes=%zd\n", + sizeof(*_aio_small)); + *pios = NULL; + return -ENOMEM; + } + extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); + if (unlikely(!extra_part)) { + ORE_DBGMSG("Failed alloc second part bytes=%zd\n", + sizeof(*extra_part)); + kfree(_aio_small); + *pios = NULL; + return -ENOMEM; + } - /*TODO: Maybe use kmem_cach per sbi of size - * exofs_io_state_size(layout->s_numdevs) - */ - ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL); - if (unlikely(!ios)) { - ORE_DBGMSG("Failed kzalloc bytes=%d\n", - ore_io_state_size(numdevs)); - *pios = NULL; - return -ENOMEM; + pages = num_par_pages ? extra_part->pages : NULL; + sgilist = sgs_per_dev ? extra_part->sglist : NULL; + /* In this case the per_dev[0].sgilist holds the pointer to + * be freed + */ + ios = &_aio_small->ios; + ios->extra_part_alloc = true; + } + + if (pages) { + ios->parity_pages = pages; + ios->max_par_pages = num_par_pages; + } + if (sgilist) { + unsigned d; + + for (d = 0; d < numdevs; ++d) { + ios->per_dev[d].sglist = sgilist; + sgilist += sgs_per_dev; + } + ios->sgs_per_dev = sgs_per_dev; } ios->layout = layout; @@ -178,9 +223,42 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, { struct ore_io_state *ios; unsigned numdevs = layout->group_width * layout->mirrors_p1; + unsigned sgs_per_dev = 0, max_par_pages = 0; int ret; - ret = _get_io_state(layout, oc, numdevs, pios); + if (layout->parity && length) { + unsigned data_devs = layout->group_width - layout->parity; + unsigned stripe_size = layout->stripe_unit * data_devs; + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + u32 remainder; + u64 num_stripes; + u64 num_raid_units; + + num_stripes = div_u64_rem(length, stripe_size, &remainder); + if (remainder) + ++num_stripes; + + num_raid_units = num_stripes * layout->parity; + + if (is_reading) { + /* For reads add per_dev sglist array */ + /* TODO: Raid 6 we need twice more. Actually: + * num_stripes / LCMdP(W,P); + * if (W%P != 0) num_stripes *= parity; + */ + + /* first/last seg is split */ + num_raid_units += layout->group_width; + sgs_per_dev = div_u64(num_raid_units, data_devs); + } else { + /* For Writes add parity pages array. */ + max_par_pages = num_raid_units * pages_in_unit * + sizeof(struct page *); + } + } + + ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, + pios); if (unlikely(ret)) return ret; @@ -189,10 +267,11 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, ios->offset = offset; if (length) { - ore_calc_stripe_info(layout, offset, &ios->si); - ios->length = (length <= ios->si.group_length) ? 
length : - ios->si.group_length; + ore_calc_stripe_info(layout, offset, length, &ios->si); + ios->length = ios->si.length; ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + if (layout->parity) + _ore_post_alloc_raid_stuff(ios); } return 0; @@ -209,7 +288,7 @@ EXPORT_SYMBOL(ore_get_rw_state); int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, struct ore_io_state **pios) { - return _get_io_state(layout, oc, oc->numdevs, pios); + return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); } EXPORT_SYMBOL(ore_get_io_state); @@ -227,6 +306,7 @@ void ore_put_io_state(struct ore_io_state *ios) bio_put(per_dev->bio); } + _ore_free_raid_stuff(ios); kfree(ios); } } @@ -367,53 +447,65 @@ EXPORT_SYMBOL(ore_check_io); /* * L - logical offset into the file * - * U - The number of bytes in a stripe within a group + * D - number of Data devices + * D = group_width - parity * - * U = stripe_unit * group_width + * U - The number of bytes in a stripe within a group + * U = stripe_unit * D * * T - The number of bytes striped within a group of component objects * (before advancing to the next group) - * - * T = stripe_unit * group_width * group_depth + * T = U * group_depth * * S - The number of bytes striped across all component objects * before the pattern repeats + * S = T * group_count * - * S = stripe_unit * group_width * group_depth * group_count - * - * M - The "major" (i.e., across all components) stripe number - * + * M - The "major" (i.e., across all components) cycle number * M = L / S * - * G - Counts the groups from the beginning of the major stripe - * + * G - Counts the groups from the beginning of the major cycle * G = (L - (M * S)) / T [or (L % S) / T] * * H - The byte offset within the group - * * H = (L - (M * S)) % T [or (L % S) % T] * * N - The "minor" (i.e., across the group) stripe number - * * N = H / U * * C - The component index coresponding to L * - * C = (H - (N * U)) / stripe_unit + G * group_width - * [or (L % U) / stripe_unit + G * group_width] + * C = (H - (N * U)) / stripe_unit + G * D + * [or (L % U) / stripe_unit + G * D] * * O - The component offset coresponding to L - * * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit + * + * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity + * divide by parity + * LCMdP = lcm(group_width, parity) / parity + * + * R - The parity Rotation stripe + * (Note parity cycle always starts at a group's boundary) + * R = N % LCMdP + * + * I = the first parity device index + * I = (group_width + group_width - R*parity - parity) % group_width + * + * Craid - The component index Rotated + * Craid = (group_width + C - R*parity) % group_width + * (We add the group_width to avoid negative numbers modulo math) */ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si) + u64 length, struct ore_striping_info *si) { u32 stripe_unit = layout->stripe_unit; u32 group_width = layout->group_width; u64 group_depth = layout->group_depth; + u32 parity = layout->parity; - u32 U = stripe_unit * group_width; + u32 D = group_width - parity; + u32 U = D * stripe_unit; u64 T = U * group_depth; u64 S = T * layout->group_count; u64 M = div64_u64(file_offset, S); @@ -429,22 +521,43 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, u32 N = div_u64(H, U); /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= layout->mirrors_p1; + u32 C = (u32)(H - (N * 
U)) / stripe_unit + G * group_width; div_u64_rem(file_offset, stripe_unit, &si->unit_off); si->obj_offset = si->unit_off + (N * stripe_unit) + (M * group_depth * stripe_unit); - si->group_length = T - H; + if (parity) { + u32 LCMdP = lcm(group_width, parity) / parity; + /* R = N % LCMdP; */ + u32 RxP = (N % LCMdP) * parity; + u32 first_dev = C - C % group_width; + + si->par_dev = (group_width + group_width - parity - RxP) % + group_width + first_dev; + si->dev = (group_width + C - RxP) % group_width + first_dev; + si->bytes_in_stripe = U; + si->first_stripe_start = M * S + G * T + N * U; + } else { + /* Make the math correct see _prepare_one_group */ + si->par_dev = group_width; + si->dev = C; + } + + si->dev *= layout->mirrors_p1; + si->par_dev *= layout->mirrors_p1; + si->offset = file_offset; + si->length = T - H; + if (si->length > length) + si->length = length; si->M = M; } EXPORT_SYMBOL(ore_calc_stripe_info); -static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct ore_per_dev_state *per_dev, - int cur_len) +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len) { unsigned pg = *cur_pg; struct request_queue *q = @@ -455,8 +568,11 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->nr_pages + pages_in_stripe) / - ios->layout->group_width; + unsigned nr_pages = ios->nr_pages * ios->layout->group_width / + (ios->layout->group_width - + ios->layout->parity); + unsigned bio_size = (nr_pages + pages_in_stripe) / + ios->layout->group_width; per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { @@ -471,12 +587,13 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); unsigned added_len; - BUG_ON(ios->nr_pages <= pg); cur_len -= pglen; - added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], + added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], pglen, pgbase); if (unlikely(pglen != added_len)) { + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", + per_dev->bio->bi_vcnt); ret = -ENOMEM; goto out; } @@ -501,9 +618,11 @@ static int _prepare_for_striping(struct ore_io_state *ios) struct ore_striping_info *si = &ios->si; unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; + unsigned group_width = ios->layout->group_width; + unsigned devs_in_group = group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); + unsigned dev_order; unsigned cur_pg = ios->pages_consumed; u64 length = ios->length; int ret = 0; @@ -513,7 +632,10 @@ static int _prepare_for_striping(struct ore_io_state *ios) return 0; } - BUG_ON(length > si->group_length); + BUG_ON(length > si->length); + + dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); + si->cur_comp = dev_order; while (length) { unsigned comp = dev - first_dev; @@ -522,17 +644,20 @@ static int _prepare_for_striping(struct ore_io_state *ios) if (!per_dev->length) { per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { + if (dev == si->dev) { + WARN_ON(dev == 
si->par_dev); per_dev->offset = si->obj_offset; cur_len = stripe_unit - si->unit_off; page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && (page_off != ios->pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; + } else { + if (si->cur_comp > dev_order) + per_dev->offset = + si->obj_offset - si->unit_off; + else /* si->cur_comp < dev_order */ + per_dev->offset = + si->obj_offset + stripe_unit - + si->unit_off; cur_len = stripe_unit; } } else { @@ -541,8 +666,8 @@ static int _prepare_for_striping(struct ore_io_state *ios) if (cur_len >= length) cur_len = length; - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len); + ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, + per_dev, cur_len); if (unlikely(ret)) goto out; @@ -550,6 +675,41 @@ static int _prepare_for_striping(struct ore_io_state *ios) dev = (dev % devs_in_group) + first_dev; length -= cur_len; + + si->cur_comp = (si->cur_comp + 1) % group_width; + if (unlikely((dev == si->par_dev) || + (!length && ios->parity_pages))) { + if (!length) + /* If we are writing and this is the very last + * stripe. then operate on parity dev. + */ + dev = si->par_dev; + if (ios->reading) + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + cur_len = length; + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } + + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); + if (unlikely(ret)) + goto out; + + /* Rotate next par_dev backwards with wraping */ + si->par_dev = (devs_in_group + si->par_dev - + ios->layout->parity * mirrors_p1) % + devs_in_group + first_dev; + /* Next stripe, start fresh */ + si->cur_comp = 0; + } } out: ios->numdevs = devs_in_group; @@ -747,12 +907,24 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) per_dev->or = or; if (ios->pages) { - osd_req_read(or, obj, per_dev->offset, - per_dev->bio, per_dev->length); + if (per_dev->cur_sg) { + /* finalize the last sg_entry */ + _ore_add_sg_seg(per_dev, 0, false); + if (unlikely(!per_dev->cur_sg)) + return 0; /* Skip parity only device */ + + osd_req_read_sg(or, obj, per_dev->bio, + per_dev->sglist, per_dev->cur_sg); + } else { + /* The no raid case */ + osd_req_read(or, obj, per_dev->offset, + per_dev->bio, per_dev->length); + } + ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d\n", _LLU(obj->id), + " dev=%d sg_len=%d\n", _LLU(obj->id), _LLU(per_dev->offset), _LLU(per_dev->length), - first_dev); + first_dev, per_dev->cur_sg); } else { BUG_ON(ios->kern_buff); @@ -849,7 +1021,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, { unsigned stripe_unit = layout->stripe_unit; - ore_calc_stripe_info(layout, file_offset, &ti->si); + ore_calc_stripe_info(layout, file_offset, 0, &ti->si); ti->prev_group_obj_off = ti->si.M * stripe_unit; ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c new file mode 100644 index 0000000..8d4b93a --- /dev/null +++ b/fs/exofs/ore_raid.c @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2011 + * Boaz Harrosh + * + * This file is part of the objects raid engine (ore). 
+ * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation " + */ + +#include + +#include "ore_raid.h" + +struct page *_raid_page_alloc(void) +{ + return alloc_page(GFP_KERNEL); +} + +void _raid_page_free(struct page *p) +{ + __free_page(p); +} + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last) +{ + struct osd_sg_entry *sge; + + ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " + "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", + per_dev->dev, cur_len, not_last, per_dev->cur_sg, + _LLU(per_dev->offset), per_dev->length, + per_dev->last_sgs_total); + + if (!per_dev->cur_sg) { + sge = per_dev->sglist; + + /* First time we prepare two entries */ + if (per_dev->length) { + ++per_dev->cur_sg; + sge->offset = per_dev->offset; + sge->len = per_dev->length; + } else { + /* Here the parity is the first unit of this object. + * This happens every time we reach a parity device on + * the same stripe as the per_dev->offset. We need to + * just skip this unit. + */ + per_dev->offset += cur_len; + return; + } + } else { + /* finalize the last one */ + sge = &per_dev->sglist[per_dev->cur_sg - 1]; + sge->len = per_dev->length - per_dev->last_sgs_total; + } + + if (not_last) { + /* Partly prepare the next one */ + struct osd_sg_entry *next_sge = sge + 1; + + ++per_dev->cur_sg; + next_sge->offset = sge->offset + sge->len + cur_len; + /* Save cur len so we know how mutch was added next time */ + per_dev->last_sgs_total = per_dev->length; + next_sge->len = 0; + } else if (!sge->len) { + /* Optimize for when the last unit is a parity */ + --per_dev->cur_sg; + } +} + +/* In writes @cur_len means length left. 
.i.e cur_len==0 is the last parity U */ +int _ore_add_parity_unit(struct ore_io_state *ios, + struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, + unsigned cur_len) +{ + if (ios->reading) { + BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); + _ore_add_sg_seg(per_dev, cur_len, true); + } else { + struct page **pages = ios->parity_pages + ios->cur_par_page; + unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE; + unsigned array_start = 0; + unsigned i; + int ret; + + for (i = 0; i < num_pages; i++) { + pages[i] = _raid_page_alloc(); + if (unlikely(!pages[i])) + return -ENOMEM; + + ++(ios->cur_par_page); + /* TODO: only read support for now */ + clear_highpage(pages[i]); + } + + ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d", + per_dev->dev, num_pages, ios->cur_par_page); + + ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, + per_dev, num_pages * PAGE_SIZE); + if (unlikely(ret)) + return ret; + } + return 0; +} + +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) +{ + /*TODO: Only raid writes has stuff to add here */ + return 0; +} + +void _ore_free_raid_stuff(struct ore_io_state *ios) +{ + if (ios->parity_pages) { /* writing and raid */ + unsigned i; + + for (i = 0; i < ios->cur_par_page; i++) { + struct page *page = ios->parity_pages[i]; + + if (page) + _raid_page_free(page); + } + if (ios->extra_part_alloc) + kfree(ios->parity_pages); + } else { + /* Will only be set if raid reading && sglist is big */ + if (ios->extra_part_alloc) + kfree(ios->per_dev[0].sglist); + } +} diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h new file mode 100644 index 0000000..c21080b --- /dev/null +++ b/fs/exofs/ore_raid.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) from 2011 + * Boaz Harrosh + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation " + */ + +#include + +#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define ORE_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define ORE_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +#define ORE_DBGMSG2(M...) do {} while (0) +/* #define ORE_DBGMSG2 ORE_DBGMSG */ + +/* Calculate the component order in a stripe. eg the logical data unit + * address within the stripe of @dev given the @par_dev of this stripe. 
+ */ +static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, + unsigned par_dev, unsigned dev) +{ + unsigned first_dev = dev - dev % devs_in_group; + + dev -= first_dev; + par_dev -= first_dev; + + if (devs_in_group == par_dev) /* The raid 0 case */ + return dev / mirrors_p1; + /* raid4/5/6 case */ + return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / + mirrors_p1; +} + +/* ios_raid.c stuff needed by ios.c */ +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); +void _ore_free_raid_stuff(struct ore_io_state *ios); + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last); +int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, unsigned cur_len); + +/* ios.c stuff needed by ios_raid.c */ +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len); diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index a8e39d1..43821c1 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -40,6 +40,7 @@ struct ore_layout { unsigned mirrors_p1; unsigned group_width; + unsigned parity; u64 group_depth; unsigned group_count; @@ -89,11 +90,16 @@ static inline void ore_comp_set_dev( } struct ore_striping_info { + u64 offset; u64 obj_offset; - u64 group_length; + u64 length; + u64 first_stripe_start; /* only used in raid writes */ u64 M; /* for truncate */ + unsigned bytes_in_stripe; unsigned dev; + unsigned par_dev; unsigned unit_off; + unsigned cur_comp; }; struct ore_io_state; @@ -127,6 +133,13 @@ struct ore_io_state { bool reading; + /* House keeping of Parity pages */ + bool extra_part_alloc; + struct page **parity_pages; + unsigned max_par_pages; + unsigned cur_par_page; + unsigned sgs_per_dev; + /* Variable array of size numdevs */ unsigned numdevs; struct ore_per_dev_state { @@ -134,7 +147,10 @@ struct ore_io_state { struct bio *bio; loff_t offset; unsigned length; + unsigned last_sgs_total; unsigned dev; + struct osd_sg_entry *sglist; + unsigned cur_sg; } per_dev[]; }; @@ -147,8 +163,7 @@ static inline unsigned ore_io_state_size(unsigned numdevs) /* ore.c */ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si); - + u64 length, struct ore_striping_info *si); int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, bool is_reading, u64 offset, u64 length, struct ore_io_state **ios); -- cgit v0.10.2 From 769ba8d92025fa390f3097e658b8ed6e032d68e9 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 14 Oct 2011 15:33:51 +0200 Subject: ore: RAID5 Write This is finally the RAID5 Write support. The bigger part of this patch is not the XOR engine itself, But the read4write logic, which is a complete mini prepare_for_striping reading engine that can read scattered pages of a stripe into cache so it can be used for XOR calculation. That is, if the write was not stripe aligned. The main algorithm behind the XOR engine is the 2 dimensional array: struct __stripe_pages_2d. A drawing might save 1000 words --- __stripe_pages_2d | n = pages_in_stripe_unit; w = group_width - parity; | pages array presented to the XOR lib | | V | __1_page_stripe[0].pages --> [c0][c1]..[cw][c_par] <---| | | __1_page_stripe[1].pages --> [c0][c1]..[cw][c_par] <--- | ... | ... 
| __1_page_stripe[n].pages --> [c0][c1]..[cw][c_par] ^ | data added columns first then row --- The pages are put on this array columns first, i.e.: p0-of-c0, p1-of-c0, ... pn-of-c0, p0-of-c1, ... So we are doing a corner turn of the pages. Note that pages will zigzag down and left, but are put sequentially in growing order. So when the time comes to XOR the stripe, only the beginning and end of the array need be checked. We scan the array and any NULL spot will be filled by pages-to-be-read. The FS that wants to support RAID5 needs to supply an operations-vector that searches for a given page in cache and specifies if the page is uptodate or needs reading. All these pages to be read are put on a slave ore_io_state and read synchronously. All the pages of a stripe are read in one IO, using the scatter-gather mechanism. In writes we constrain our IO to only be incomplete on a single stripe. Meaning: either the complete IO is within a single stripe, so we might have pages to read at both the beginning and the end of the stripe; or we have some reading to do at the beginning but end at a stripe boundary. The leftover pages are pushed to the next IO by the API already established by previous work, where an IO offset/length combination presented to the ORE might get the length truncated and the user must re-submit the leftover pages. (Both exofs and NFS support this.) But any ORE user should make its best effort to align its IO beforehand and avoid complications. A cached ore_layout->stripe_size member can be used for that calculation. (NOTE: the ORE demands that stripe_size may not be bigger than 32 bits.) What else? Well, read it and tell me. Signed-off-by: Boaz Harrosh diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index 70bae41..fa9a286 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -1,10 +1,17 @@ +# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects +# for every ORE user we do it like this. Any user should add itself here +# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are +# selected here, and we default to "ON". So in effect it is like been +# selected by any of the users. config ORE tristate + depends on EXOFS_FS + select ASYNC_XOR + default SCSI_OSD_ULD config EXOFS_FS tristate "exofs: OSD based file system support" depends on SCSI_OSD_ULD - select ORE help EXOFS is a file system that uses an OSD storage device, as its backing storage.
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index fd6090d..08ee454 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -95,6 +95,14 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * layout->group_width; + if (layout->parity) { + unsigned stripe_length = + (layout->group_width - layout->parity) * + layout->stripe_unit; + + layout->max_io_length /= stripe_length; + layout->max_io_length *= stripe_length; + } return 0; } EXPORT_SYMBOL(ore_verify_layout); @@ -118,7 +126,7 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) return ore_comp_dev(ios->oc, index); } -static int _ore_get_io_state(struct ore_layout *layout, +int _ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, unsigned numdevs, unsigned sgs_per_dev, unsigned num_par_pages, struct ore_io_state **pios) @@ -334,7 +342,7 @@ static void _done_io(struct osd_request *or, void *p) kref_put(&ios->kref, _last_io); } -static int ore_io_execute(struct ore_io_state *ios) +int ore_io_execute(struct ore_io_state *ios) { DECLARE_COMPLETION_ONSTACK(wait); bool sync = (ios->done == NULL); @@ -597,6 +605,8 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, ret = -ENOMEM; goto out; } + _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); + pgbase = 0; ++pg; } @@ -636,6 +646,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); si->cur_comp = dev_order; + si->cur_pg = si->unit_off / PAGE_SIZE; while (length) { unsigned comp = dev - first_dev; @@ -677,14 +688,14 @@ static int _prepare_for_striping(struct ore_io_state *ios) length -= cur_len; si->cur_comp = (si->cur_comp + 1) % group_width; - if (unlikely((dev == si->par_dev) || - (!length && ios->parity_pages))) { - if (!length) + if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { + if (!length && ios->sp2d) { /* If we are writing and this is the very last * stripe. then operate on parity dev. */ dev = si->par_dev; - if (ios->reading) + } + if (ios->sp2d) /* In writes cur_len just means if it's the * last one. See _ore_add_parity_unit. */ @@ -709,6 +720,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) devs_in_group + first_dev; /* Next stripe, start fresh */ si->cur_comp = 0; + si->cur_pg = 0; } } out: @@ -873,6 +885,14 @@ int ore_write(struct ore_io_state *ios) int i; int ret; + if (unlikely(ios->sp2d && !ios->r4w)) { + /* A library is attempting a RAID-write without providing + * a pages lock interface. 
+ */ + WARN_ON_ONCE(1); + return -ENOTSUPP; + } + ret = _prepare_for_striping(ios); if (unlikely(ret)) return ret; @@ -888,7 +908,7 @@ int ore_write(struct ore_io_state *ios) } EXPORT_SYMBOL(ore_write); -static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) { struct osd_request *or; struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; @@ -952,7 +972,7 @@ int ore_read(struct ore_io_state *ios) return ret; for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _read_mirror(ios, i); + ret = _ore_read_mirror(ios, i); if (unlikely(ret)) return ret; } diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 8d4b93a..29c47e5 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -14,9 +14,13 @@ */ #include +#include #include "ore_raid.h" +#undef ORE_DBGMSG2 +#define ORE_DBGMSG2 ORE_DBGMSG + struct page *_raid_page_alloc(void) { return alloc_page(GFP_KERNEL); @@ -27,6 +31,236 @@ void _raid_page_free(struct page *p) __free_page(p); } +/* This struct is forward declare in ore_io_state, but is private to here. + * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. + * + * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. + * Ascending page index access is sp2d(p-minor, c-major). But storage is + * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor + * API. + */ +struct __stripe_pages_2d { + /* Cache some hot path repeated calculations */ + unsigned parity; + unsigned data_devs; + unsigned pages_in_unit; + + bool needed ; + + /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ + struct __1_page_stripe { + bool alloc; + unsigned write_count; + struct async_submit_ctl submit; + struct dma_async_tx_descriptor *tx; + + /* The size of this array is data_devs + parity */ + struct page **pages; + struct page **scribble; + /* bool array, size of this array is data_devs */ + char *page_is_read; + } _1p_stripes[]; +}; + +/* This can get bigger then a page. So support multiple page allocations + * _sp2d_free should be called even if _sp2d_alloc fails (by returning + * none-zero). + */ +static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, + unsigned parity, struct __stripe_pages_2d **psp2d) +{ + struct __stripe_pages_2d *sp2d; + unsigned data_devs = group_width - parity; + struct _alloc_all_bytes { + struct __alloc_stripe_pages_2d { + struct __stripe_pages_2d sp2d; + struct __1_page_stripe _1p_stripes[pages_in_unit]; + } __asp2d; + struct __alloc_1p_arrays { + struct page *pages[group_width]; + struct page *scribble[group_width]; + char page_is_read[data_devs]; + } __a1pa[pages_in_unit]; + } *_aab; + struct __alloc_1p_arrays *__a1pa; + struct __alloc_1p_arrays *__a1pa_end; + const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); + unsigned num_a1pa, alloc_size, i; + + /* FIXME: check these numbers in ore_verify_layout */ + BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); + BUG_ON(sizeof__a1pa > PAGE_SIZE); + + if (sizeof(*_aab) > PAGE_SIZE) { + num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; + alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; + } else { + num_a1pa = pages_in_unit; + alloc_size = sizeof(*_aab); + } + + _aab = kzalloc(alloc_size, GFP_KERNEL); + if (unlikely(!_aab)) { + ORE_DBGMSG("!! 
Failed to alloc sp2d size=%d\n", alloc_size); + return -ENOMEM; + } + + sp2d = &_aab->__asp2d.sp2d; + *psp2d = sp2d; /* From here Just call _sp2d_free */ + + __a1pa = _aab->__a1pa; + __a1pa_end = __a1pa + num_a1pa; + + for (i = 0; i < pages_in_unit; ++i) { + if (unlikely(__a1pa >= __a1pa_end)) { + num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, + pages_in_unit - i); + + __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); + if (unlikely(!__a1pa)) { + ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", + num_a1pa); + return -ENOMEM; + } + __a1pa_end = __a1pa + num_a1pa; + /* First *pages is marked for kfree of the buffer */ + sp2d->_1p_stripes[i].alloc = true; + } + + sp2d->_1p_stripes[i].pages = __a1pa->pages; + sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; + sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; + ++__a1pa; + } + + sp2d->parity = parity; + sp2d->data_devs = data_devs; + sp2d->pages_in_unit = pages_in_unit; + return 0; +} + +static void _sp2d_reset(struct __stripe_pages_2d *sp2d, + const struct _ore_r4w_op *r4w, void *priv) +{ + unsigned data_devs = sp2d->data_devs; + unsigned group_width = data_devs + sp2d->parity; + unsigned p; + + if (!sp2d->needed) + return; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count < group_width) { + unsigned c; + + for (c = 0; c < data_devs; c++) + if (_1ps->page_is_read[c]) { + struct page *page = _1ps->pages[c]; + + r4w->put_page(priv, page); + _1ps->page_is_read[c] = false; + } + } + + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); + _1ps->write_count = 0; + _1ps->tx = NULL; + } + + sp2d->needed = false; +} + +static void _sp2d_free(struct __stripe_pages_2d *sp2d) +{ + unsigned i; + + if (!sp2d) + return; + + for (i = 0; i < sp2d->pages_in_unit; ++i) { + if (sp2d->_1p_stripes[i].alloc) + kfree(sp2d->_1p_stripes[i].pages); + } + + kfree(sp2d); +} + +static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (!_1ps->write_count) + continue; + + init_async_submit(&_1ps->submit, + ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, + NULL, + NULL, NULL, + (addr_conv_t *)_1ps->scribble); + + /* TODO: raid6 */ + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, + 0, sp2d->data_devs, PAGE_SIZE, + &_1ps->submit); + } + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + /* NOTE: We wait for HW synchronously (I don't have such HW + * to test with.) Is parallelism needed with today's multi + * cores? 
+ */ + async_tx_issue_pending(_1ps->tx); + } +} + +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + struct __1_page_stripe *_1ps; + + sp2d->needed = true; + + _1ps = &sp2d->_1p_stripes[si->cur_pg]; + _1ps->pages[si->cur_comp] = page; + ++_1ps->write_count; + + si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; + /* si->cur_comp is advanced outside at main loop */ +} + void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, bool not_last) { @@ -76,6 +310,240 @@ void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, } } +static int _alloc_read_4_write(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + int ret; + /* We want to only read those pages not in cache so worst case + * is a stripe populated with every other page + */ + unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; + + ret = _ore_get_io_state(layout, ios->oc, + layout->group_width * layout->mirrors_p1, + sgs_per_dev, 0, &ios->ios_read_4_write); + return ret; +} + +/* @si contains info of the to-be-inserted page. Update of @si should be + * maintained by caller. Specificaly si->dev, si->obj_offset, ... + */ +static int _add_to_read_4_write(struct ore_io_state *ios, + struct ore_striping_info *si, struct page *page) +{ + struct request_queue *q; + struct ore_per_dev_state *per_dev; + struct ore_io_state *read_ios; + unsigned first_dev = si->dev - (si->dev % + (ios->layout->group_width * ios->layout->mirrors_p1)); + unsigned comp = si->dev - first_dev; + unsigned added_len; + + if (!ios->ios_read_4_write) { + int ret = _alloc_read_4_write(ios); + + if (unlikely(ret)) + return ret; + } + + read_ios = ios->ios_read_4_write; + read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; + + per_dev = &read_ios->per_dev[comp]; + if (!per_dev->length) { + per_dev->bio = bio_kmalloc(GFP_KERNEL, + ios->sp2d->pages_in_unit); + if (unlikely(!per_dev->bio)) { + ORE_DBGMSG("Failed to allocate BIO size=%u\n", + ios->sp2d->pages_in_unit); + return -ENOMEM; + } + per_dev->offset = si->obj_offset; + per_dev->dev = si->dev; + } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { + u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); + + _ore_add_sg_seg(per_dev, gap, true); + } + q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); + added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); + if (unlikely(added_len != PAGE_SIZE)) { + ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", + per_dev->bio->bi_vcnt); + return -ENOMEM; + } + + per_dev->length += PAGE_SIZE; + return 0; +} + +static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) +{ + struct bio_vec *bv; + unsigned i, d; + + /* loop on all devices all pages */ + for (d = 0; d < ios->numdevs; d++) { + struct bio *bio = ios->per_dev[d].bio; + + if (!bio) + continue; + + __bio_for_each_segment(bv, bio, i, 0) { + struct page *page = bv->bv_page; + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } + } +} + +/* read_4_write is hacked to read the start of the first stripe and/or + * the end of the last stripe. If needed, with an sg-gap at each device/page. + * It is assumed to be called after the to_be_written pages of the first stripe + * are populating ios->sp2d[][] + * + * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations + * These pages are held at sp2d[p].pages[c] but with + * sp2d[p].page_is_read[c] = true. 
+ * released via ios->r4w->put_page(). The ios->r4w->get_page might signal
+ * that the page is @uptodate=true, so we don't need to read it, only unlock
+ * it after IO.
+ *
+ * TODO: The read_4_write should calc a need_to_read_pages_count; if bigger
+ * than the to-be-written count, we should consider the xor-in-place mode.
+ * need_to_read_pages_count is the actual number of pages not present in
+ * cache. Maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
+ * approximation? In this mode the read pages are put in the empty places of
+ * ios->sp2d[p][*], xor is calculated the same way. These pages are
+ * allocated/freed and don't go through cache
+ */
+static int _read_4_write(struct ore_io_state *ios)
+{
+ struct ore_io_state *ios_read;
+ struct ore_striping_info read_si;
+ struct __stripe_pages_2d *sp2d = ios->sp2d;
+ u64 offset = ios->si.first_stripe_start;
+ u64 last_stripe_end;
+ unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
+ unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+ int ret;
+
+ if (offset == ios->offset) /* Go to start collect $200 */
+ goto read_last_stripe;
+
+ min_p = _sp2d_min_pg(sp2d);
+ max_p = _sp2d_max_pg(sp2d);
+
+ for (c = 0; ; c++) {
+ ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+ read_si.obj_offset += min_p * PAGE_SIZE;
+ offset += min_p * PAGE_SIZE;
+ for (p = min_p; p <= max_p; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+ struct page **pp = &_1ps->pages[c];
+ bool uptodate;
+
+ if (*pp)
+ /* to-be-written pages start here */
+ goto read_last_stripe;
+
+ *pp = ios->r4w->get_page(ios->private, offset,
+ &uptodate);
+ if (unlikely(!*pp))
+ return -ENOMEM;
+
+ if (!uptodate)
+ _add_to_read_4_write(ios, &read_si, *pp);
+
+ /* Mark read-pages to be cache_released */
+ _1ps->page_is_read[c] = true;
+ read_si.obj_offset += PAGE_SIZE;
+ offset += PAGE_SIZE;
+ }
+ offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
+ }
+
+read_last_stripe:
+ offset = ios->offset + (ios->length + PAGE_SIZE - 1) /
+ PAGE_SIZE * PAGE_SIZE;
+ last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
+ * bytes_in_stripe;
+ if (offset == last_stripe_end) /* Optimize for the aligned case */
+ goto read_it;
+
+ ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+ p = read_si.unit_off / PAGE_SIZE;
+ c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
+ ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
+
+ BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end);
+ /* unaligned IO must be within a single stripe */
+
+ if (min_p == sp2d->pages_in_unit) {
+ /* Didn't do it yet */
+ min_p = _sp2d_min_pg(sp2d);
+ max_p = _sp2d_max_pg(sp2d);
+ }
+
+ while (offset < last_stripe_end) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if ((min_p <= p) && (p <= max_p)) {
+ struct page *page;
+ bool uptodate;
+
+ BUG_ON(_1ps->pages[c]);
+ page = ios->r4w->get_page(ios->private, offset,
+ &uptodate);
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ _1ps->pages[c] = page;
+ /* Mark read-pages to be cache_released */
+ _1ps->page_is_read[c] = true;
+ if (!uptodate)
+ _add_to_read_4_write(ios, &read_si, page);
+ }
+
+ offset += PAGE_SIZE;
+ if (p == (sp2d->pages_in_unit - 1)) {
+ ++c;
+ p = 0;
+ ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+ } else {
+ read_si.obj_offset += PAGE_SIZE;
+ ++p;
+ }
+ }
+
+read_it:
+ ios_read = ios->ios_read_4_write;
+ if (!ios_read)
+ return 0;
+
+ /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
+ * to check for per_dev->bio
+ */
+ ios_read->pages = ios->pages;
+
+ /* Now read these devices */
+ for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
+ ret = _ore_read_mirror(ios_read, i);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ ret = ore_io_execute(ios_read); /* Synchronous execution */
+ if (unlikely(ret)) {
+ ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
+ return ret;
+ }
+
+ _mark_read4write_pages_uptodate(ios_read, ret);
+ return 0;
+}
+
 /* In writes @cur_len means length left. i.e. cur_len==0 is the last parity unit */
 int _ore_add_parity_unit(struct ore_io_state *ios,
 struct ore_striping_info *si,
@@ -86,42 +554,89 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
 BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
 
 _ore_add_sg_seg(per_dev, cur_len, true);
 } else {
+ struct __stripe_pages_2d *sp2d = ios->sp2d;
 struct page **pages = ios->parity_pages + ios->cur_par_page;
- unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE;
+ unsigned num_pages;
 unsigned array_start = 0;
 unsigned i;
 int ret;
 
+ si->cur_pg = _sp2d_min_pg(sp2d);
+ num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
+
+ if (!cur_len) /* If last stripe operate on parity comp */
+ si->cur_comp = sp2d->data_devs;
+
+ if (!per_dev->length) {
+ per_dev->offset += si->cur_pg * PAGE_SIZE;
+ /* If first stripe, read in all read4write pages
+ * (if needed) before we calculate the first parity.
+ */
+ _read_4_write(ios);
+ }
+
 for (i = 0; i < num_pages; i++) {
 pages[i] = _raid_page_alloc();
 if (unlikely(!pages[i]))
 return -ENOMEM;
 
 ++(ios->cur_par_page);
- /* TODO: only read support for now */
- clear_highpage(pages[i]);
 }
 
- ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d",
- per_dev->dev, num_pages, ios->cur_par_page);
+ BUG_ON(si->cur_comp != sp2d->data_devs);
+ BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
 
 ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
 per_dev, num_pages * PAGE_SIZE);
 if (unlikely(ret))
 return ret;
+
+ /* TODO: raid6 if (last_parity_dev) */
+ _gen_xor_unit(sp2d);
+ _sp2d_reset(sp2d, ios->r4w, ios->private);
 }
 return 0;
 }
 
 int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
 {
- /*TODO: Only raid writes has stuff to add here */
+ struct ore_layout *layout = ios->layout;
+
+ if (ios->parity_pages) {
+ unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+ unsigned stripe_size = ios->si.bytes_in_stripe;
+ u64 last_stripe, first_stripe;
+
+ if (_sp2d_alloc(pages_in_unit, layout->group_width,
+ layout->parity, &ios->sp2d)) {
+ return -ENOMEM;
+ }
+
+ BUG_ON(ios->offset % PAGE_SIZE);
+
+ /* Round io down to last full stripe */
+ first_stripe = div_u64(ios->offset, stripe_size);
+ last_stripe = div_u64(ios->offset + ios->length, stripe_size);
+
+ /* If an IO spans more than a single stripe it must end at
+ * a stripe boundary. The remainder at the end is pushed into
+ * the next IO.
+ */
+ if (last_stripe != first_stripe) {
+ ios->length = last_stripe * stripe_size - ios->offset;
+
+ BUG_ON(!ios->length);
+ ios->nr_pages = (ios->length + PAGE_SIZE - 1) /
+ PAGE_SIZE;
+ ios->si.length = ios->length; /* make it consistent */
+ }
+ }
 return 0;
 }
 
 void _ore_free_raid_stuff(struct ore_io_state *ios)
 {
- if (ios->parity_pages) { /* writing and raid */
+ if (ios->sp2d) { /* writing and raid */
 unsigned i;
 
 for (i = 0; i < ios->cur_par_page; i++) {
@@ -132,9 +647,14 @@ void _ore_free_raid_stuff(struct ore_io_state *ios)
 }
 if (ios->extra_part_alloc)
 kfree(ios->parity_pages);
+ /* If IO returned an error, pages might need unlocking */
+ _sp2d_reset(ios->sp2d, ios->r4w, ios->private);
+ _sp2d_free(ios->sp2d);
 } else {
 /* Will only be set if raid reading && sglist is big */
 if (ios->extra_part_alloc)
 kfree(ios->per_dev[0].sglist);
 }
+ if (ios->ios_read_4_write)
+ ore_put_io_state(ios->ios_read_4_write);
 }
 
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
index c21080b..2ffd2c3 100644
--- a/fs/exofs/ore_raid.h
+++ b/fs/exofs/ore_raid.h
@@ -57,8 +57,23 @@ void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
 bool not_last);
 int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
 struct ore_per_dev_state *per_dev, unsigned cur_len);
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+ struct ore_striping_info *si, struct page *page);
+static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
+ struct ore_striping_info *si, struct page *page)
+{
+ if (!sp2d) /* Inline the fast path */
+ return; /* Hay no raid stuff */
+ _ore_add_stripe_page(sp2d, si, page);
+}
 
 /* ios.c stuff needed by ios_raid.c */
+int _ore_get_io_state(struct ore_layout *layout,
+ struct ore_components *oc, unsigned numdevs,
+ unsigned sgs_per_dev, unsigned num_par_pages,
+ struct ore_io_state **pios);
 int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
 unsigned pgbase, struct page **pages,
 struct ore_per_dev_state *per_dev, int cur_len);
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
+int ore_io_execute(struct ore_io_state *ios);
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index 43821c1..f05fa82 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -99,11 +99,17 @@ struct ore_striping_info {
 unsigned dev;
 unsigned par_dev;
 unsigned unit_off;
+ unsigned cur_pg;
 unsigned cur_comp;
 };
 
 struct ore_io_state;
 typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private);
+struct _ore_r4w_op {
+ /* @priv given here is passed as ios->private */
+ struct page * (*get_page)(void *priv, u64 page_index, bool *uptodate);
+ void (*put_page)(void *priv, struct page *page);
+};
 
 struct ore_io_state {
 struct kref kref;
@@ -139,6 +145,9 @@ struct ore_io_state {
 unsigned max_par_pages;
 unsigned cur_par_page;
 unsigned sgs_per_dev;
+ struct __stripe_pages_2d *sp2d;
+ struct ore_io_state *ios_read_4_write;
+ const struct _ore_r4w_op *r4w;
 
 /* Variable array of size numdevs */
 unsigned numdevs;
-- 
cgit v0.10.2
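Editor's illustration, not part of the series: the parity math that _gen_xor_unit() above hands to async_xor() is plain byte-wise XOR across the data devices of one page-row of the stripe. Below is a minimal, self-contained user-space model; all model_* names are hypothetical and the kernel path additionally manages scribble buffers and asynchronous completion.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MODEL_PAGE_SIZE 4096

/* XOR @data_devs source pages, byte by byte, into @parity */
static void model_gen_xor_unit(unsigned char **pages, unsigned data_devs,
			       unsigned char *parity)
{
	unsigned d, i;

	memset(parity, 0, MODEL_PAGE_SIZE);
	for (d = 0; d < data_devs; d++)
		for (i = 0; i < MODEL_PAGE_SIZE; i++)
			parity[i] ^= pages[d][i];
}

int main(void)
{
	enum { DATA_DEVS = 3 };	/* e.g. group_width=4, parity=1 */
	unsigned char *pages[DATA_DEVS], parity[MODEL_PAGE_SIZE];
	unsigned d;

	for (d = 0; d < DATA_DEVS; d++) {
		pages[d] = malloc(MODEL_PAGE_SIZE);
		if (!pages[d])
			return 1;
		memset(pages[d], 'A' + d, MODEL_PAGE_SIZE);
	}
	model_gen_xor_unit(pages, DATA_DEVS, parity);

	/* 'A' ^ 'B' ^ 'C' == 0x40 for every byte of the parity page */
	printf("parity[0] = 0x%02x\n", parity[0]);
	for (d = 0; d < DATA_DEVS; d++)
		free(pages[d]);
	return 0;
}

XOR-ing any two of the data pages with the parity page recovers the third, which is exactly the RAID5 recovery property the write path must maintain, and why partial-stripe writes must first read the missing pages.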
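A second small model, under the same assumptions: _sp2d_min_pg()/_sp2d_max_pg() in the patch clip all read-4-write and parity work to the page-rows that actually received to-be-written pages. A standalone sketch of that scan over a hypothetical write_count[] array (note it uses a signed index for the downward loop):

#include <stdio.h>

static int model_min_pg(const unsigned *write_count, unsigned n)
{
	unsigned p;

	for (p = 0; p < n; p++)
		if (write_count[p])
			return p;
	return -1;	/* no row touched */
}

static int model_max_pg(const unsigned *write_count, unsigned n)
{
	int p;

	for (p = n - 1; p >= 0; p--)
		if (write_count[p])
			return p;
	return -1;	/* no row touched */
}

int main(void)
{
	/* pages_in_unit == 8; only rows 2..5 were touched by the write */
	unsigned write_count[8] = { 0, 0, 3, 4, 4, 1, 0, 0 };

	printf("read-4-write rows: %d..%d\n",
	       model_min_pg(write_count, 8), model_max_pg(write_count, 8));
	return 0;
}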
From dd296619974c50c46c67e58f355a7e85ef3f0c01 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh
Date: Wed, 12 Oct 2011 15:42:07 +0200
Subject: exofs: Support for RAID5 read-4-write interface.

The ORE needs to be supplied an r4w_get_page/r4w_put_page API from the
filesystem, so it can get cache pages to read into when writing partial
stripes.

Also I commented out and NULLed the .writepage (singular) vector, because it
gives a terrible write pattern to raid and is apparently not needed.
Even in OOM conditions the system copes (even better) without it.

TODO: How to specify to write_cache_pages() to start or include a certain page?

Signed-off-by: Boaz Harrosh

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 86c0ac8..3e5f3a6 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -63,6 +63,7 @@ struct page_collect {
 bool read_4_write; /* This means two things: that the read is sync
 * and the pages should not be unlocked.
 */
+ struct page *that_locked_page;
 };
 
 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -81,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
 pcol->length = 0;
 pcol->pg_first = -1;
 pcol->read_4_write = false;
+ pcol->that_locked_page = NULL;
 }
 
 static void _pcol_reset(struct page_collect *pcol)
@@ -93,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol)
 pcol->length = 0;
 pcol->pg_first = -1;
 pcol->ios = NULL;
+ pcol->that_locked_page = NULL;
 
 /* this is probably the end of the loop but in writes
 * it might not end here. don't be left with nothing
@@ -391,6 +394,8 @@ static int readpage_strip(void *data, struct page *page)
 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
 page->index);
 
+ pcol->that_locked_page = page;
+
 if (page->index < end_index)
 len = PAGE_CACHE_SIZE;
 else if (page->index == end_index)
@@ -560,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p)
 EXOFS_DBGMSG2("writepages_done END\n");
 }
 
+static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
+{
+ struct page_collect *pcol = priv;
+ pgoff_t index = offset / PAGE_SIZE;
+
+ if (!pcol->that_locked_page ||
+ (pcol->that_locked_page->index != index)) {
+ struct page *page = find_get_page(pcol->inode->i_mapping, index);
+
+ if (!page) {
+ page = find_or_create_page(pcol->inode->i_mapping,
+ index, GFP_NOFS);
+ if (unlikely(!page)) {
+ EXOFS_DBGMSG("grab_cache_page Failed "
+ "index=0x%llx\n", _LLU(index));
+ return NULL;
+ }
+ unlock_page(page);
+ }
+ if (PageDirty(page) || PageWriteback(page))
+ *uptodate = true;
+ else
+ *uptodate = PageUptodate(page);
+ EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate);
+ return page;
+ } else {
+ EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n",
+ pcol->that_locked_page->index);
+ *uptodate = true;
+ return pcol->that_locked_page;
+ }
+}
+
+static void __r4w_put_page(void *priv, struct page *page)
+{
+ struct page_collect *pcol = priv;
+
+ if (pcol->that_locked_page != page) {
+ EXOFS_DBGMSG("index=0x%lx\n", page->index);
+ page_cache_release(page);
+ return;
+ }
+ EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index);
+}
+
+static const struct _ore_r4w_op _r4w_op = {
+ .get_page = &__r4w_get_page,
+ .put_page = &__r4w_put_page,
+};
+
 static int write_exec(struct page_collect *pcol)
 {
 struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -589,6 +644,7 @@ static int write_exec(struct page_collect *pcol)
 ios = pcol->ios;
 ios->pages = pcol_copy->pages;
 ios->done = writepages_done;
+ ios->r4w = &_r4w_op;
 ios->private = pcol_copy;
 
 /* pages ownership was passed to pcol_copy */
@@ -773,6 +829,7 @@ static int exofs_writepages(struct address_space *mapping,
 return 0;
 }
 
+/*
 static int exofs_writepage(struct page *page, struct writeback_control *wbc)
 {
 struct page_collect pcol;
@@ -788,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
 
 return write_exec(&pcol);
 }
-
+*/
 /* i_mutex held using inode->i_size directly */
 static void _write_failed(struct inode *inode, loff_t to)
 {
@@ -894,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset)
 
 const struct address_space_operations exofs_aops = {
 .readpage = exofs_readpage,
 .readpages = exofs_readpages,
- .writepage = exofs_writepage,
+ .writepage = NULL,
 .writepages = exofs_writepages,
 .write_begin = exofs_write_begin_export,
 .write_end = exofs_write_end,
-- 
cgit v0.10.2
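Editor's sketch of the contract the patch above implements: ORE calls r4w->get_page() for every stripe page it must read before computing parity, and r4w->put_page() when _sp2d_reset() releases them; the @uptodate out-parameter tells ORE whether the read can be skipped. A user-space mock of that callback shape follows; all mock_* names are hypothetical stand-ins for the page cache.

#include <stdbool.h>
#include <stdio.h>

struct mock_page { unsigned long index; bool uptodate; };

struct mock_r4w_op {
	/* @priv is passed through, like ios->private in the kernel API */
	struct mock_page *(*get_page)(void *priv, unsigned long index,
				      bool *uptodate);
	void (*put_page)(void *priv, struct mock_page *page);
};

static struct mock_page cache[8];

static struct mock_page *mock_get_page(void *priv, unsigned long index,
				       bool *uptodate)
{
	(void)priv;
	*uptodate = cache[index].uptodate;
	return &cache[index];
}

static void mock_put_page(void *priv, struct mock_page *page)
{
	(void)priv;
	printf("released page %lu\n", page->index);
}

static const struct mock_r4w_op mock_ops = {
	.get_page = mock_get_page,
	.put_page = mock_put_page,
};

int main(void)
{
	bool uptodate;
	struct mock_page *pg;
	unsigned long i;

	for (i = 0; i < 8; i++)
		cache[i] = (struct mock_page){ .index = i, .uptodate = i & 1 };

	/* A partial-stripe write: fetch the pages we are NOT writing and
	 * schedule a read only for those the "cache" cannot vouch for. */
	for (i = 0; i < 4; i++) {
		pg = mock_ops.get_page(NULL, i, &uptodate);
		if (!uptodate)
			printf("would read page %lu before XOR\n", pg->index);
	}
	/* ...after parity is generated, the reset path returns them */
	for (i = 0; i < 4; i++)
		mock_ops.put_page(NULL, &cache[i]);
	return 0;
}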
From 44231e686b2ba3b5702db867bb84e6d76b7cf2c7 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh
Date: Sun, 21 Nov 2010 20:03:24 +0200
Subject: ore: Enable RAID5 mounts

Now that we support raid5, enable it at mount time. Raid6 will come next;
raid4 is not in demand, so it will probably not be enabled (until someone
wants it).

NOTE: mkfs.exofs has had raid5/6 support for a long time. (Making an empty
raidX FS is just as easy as raid0 ;-})

Signed-off-by: Boaz Harrosh

diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 08ee454..fcfa86a 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -49,9 +49,17 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
 {
 u64 stripe_length;
 
-/* FIXME: Only raid0 is supported for now. */
- if (layout->raid_algorithm != PNFS_OSD_RAID_0) {
- ORE_ERR("Only RAID_0 for now\n");
+ switch (layout->raid_algorithm) {
+ case PNFS_OSD_RAID_0:
+ layout->parity = 0;
+ break;
+ case PNFS_OSD_RAID_5:
+ layout->parity = 1;
+ break;
+ case PNFS_OSD_RAID_PQ:
+ case PNFS_OSD_RAID_4:
+ default:
+ ORE_ERR("Only RAID_0/5 for now\n");
 return -EINVAL;
 }
 if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
-- 
cgit v0.10.2
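Editor's note on what the new switch implies for capacity: choosing PNFS_OSD_RAID_5 sets layout->parity = 1, so one component out of every group_width holds parity and the user data carried per stripe shrinks accordingly. A hedged sketch with illustrative numbers only (the variable names mirror the layout fields but are local assumptions):

#include <stdio.h>

enum { RAID_0, RAID_5 };

int main(void)
{
	unsigned group_width = 4;		/* devices per group */
	unsigned stripe_unit = 64 * 1024;	/* bytes per unit */
	int algo = RAID_5;
	unsigned parity = (algo == RAID_5) ? 1 : 0;
	unsigned data_devs = group_width - parity;

	/* With RAID_5, a 4-wide group carries 3 data units per stripe */
	printf("data units per stripe: %u\n", data_devs);
	printf("user bytes per stripe: %u\n", data_devs * stripe_unit);
	return 0;
}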