From 1f3c9907b8037de7b9635b0b471572ea8758bcd1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 11 Dec 2012 13:09:00 +1100 Subject: md: removed unused variable in calc_sb_1_csm. 'i' is unused. NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 6120071..89dbd44 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1414,12 +1414,11 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) unsigned long long newcsum; int size = 256 + le32_to_cpu(sb->max_dev)*2; __le32 *isuper = (__le32*)sb; - int i; disk_csum = sb->sb_csum; sb->sb_csum = 0; newcsum = 0; - for (i=0; size>=4; size -= 4 ) + for (; size >= 4; size -= 4) newcsum += le32_to_cpu(*isuper++); if (size == 2) -- cgit v0.10.2 From a7a3f08dc24690ae00c75cfe4b4701a970dd0155 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 11 Dec 2012 13:35:54 +1100 Subject: md: close race between removing and adding a device. When we remove a device from an md array, the final removal of the "dev-XX" sys entry is run asynchronously. If we then re-add that device immediately before the worker thread gets to run, we can end up trying to add the "dev-XX" sysfs entry back before it has been removed. So in both places where we add a device, call flush_workqueue(md_misc_wq); before taking the md lock (as holding the md lock can prevent removal to complete). Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 89dbd44..8b0e3d2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4752,6 +4752,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, } mddev_get(mddev); spin_unlock(&all_mddevs_lock); + if (entry->store == new_dev_store) + flush_workqueue(md_misc_wq); rv = mddev_lock(mddev); if (!rv) { rv = entry->store(mddev, page, length); @@ -6397,6 +6399,10 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto abort; } + if (cmd == ADD_NEW_DISK) + /* need to ensure md_delayed_delete() has completed */ + flush_workqueue(md_misc_wq); + err = mddev_lock(mddev); if (err) { printk(KERN_INFO -- cgit v0.10.2 From c02c0aeb6c728e5ad705dba86784bd553cb6f059 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 11 Dec 2012 13:39:21 +1100 Subject: md.c: re-indent various 'switch' statements. Intent was unnecessarily deep. Also change one 'switch' which has a single case element, into an 'if'. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 8b0e3d2..520056a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6347,24 +6347,23 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, * Commands dealing with the RAID driver but not any * particular array: */ - switch (cmd) - { - case RAID_VERSION: - err = get_version(argp); - goto done; + switch (cmd) { + case RAID_VERSION: + err = get_version(argp); + goto done; - case PRINT_RAID_DEBUG: - err = 0; - md_print_devices(); - goto done; + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done; #ifndef MODULE - case RAID_AUTORUN: - err = 0; - autostart_arrays(arg); - goto done; + case RAID_AUTORUN: + err = 0; + autostart_arrays(arg); + goto done; #endif - default:; + default:; } /* @@ -6411,50 +6410,44 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto abort; } - switch (cmd) - { - case SET_ARRAY_INFO: - { - mdu_array_info_t info; - if (!arg) - memset(&info, 0, sizeof(info)); - else if (copy_from_user(&info, argp, sizeof(info))) { - err = -EFAULT; - goto abort_unlock; - } - if (mddev->pers) { - err = update_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't update" - " array info. %d\n", err); - goto abort_unlock; - } - goto done_unlock; - } - if (!list_empty(&mddev->disks)) { - printk(KERN_WARNING - "md: array %s already has disks!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - if (mddev->raid_disks) { - printk(KERN_WARNING - "md: array %s already initialised!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - err = set_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't set" - " array info. %d\n", err); - goto abort_unlock; - } + if (cmd == SET_ARRAY_INFO) { + mdu_array_info_t info; + if (!arg) + memset(&info, 0, sizeof(info)); + else if (copy_from_user(&info, argp, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + if (mddev->pers) { + err = update_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't update" + " array info. %d\n", err); + goto abort_unlock; } goto done_unlock; - - default:; + } + if (!list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: array %s already has disks!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (mddev->raid_disks) { + printk(KERN_WARNING + "md: array %s already initialised!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't set" + " array info. %d\n", err); + goto abort_unlock; + } + goto done_unlock; } /* @@ -6473,52 +6466,51 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, /* * Commands even a read-only array can execute: */ - switch (cmd) - { - case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, argp); - goto done_unlock; + switch (cmd) { + case GET_BITMAP_FILE: + err = get_bitmap_file(mddev, argp); + goto done_unlock; - case RESTART_ARRAY_RW: - err = restart_array(mddev); - goto done_unlock; + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; - case STOP_ARRAY: - err = do_md_stop(mddev, 0, bdev); - goto done_unlock; + case STOP_ARRAY: + err = do_md_stop(mddev, 0, bdev); + goto done_unlock; - case STOP_ARRAY_RO: - err = md_set_readonly(mddev, bdev); - goto done_unlock; + case STOP_ARRAY_RO: + err = md_set_readonly(mddev, bdev); + goto done_unlock; - case BLKROSET: - if (get_user(ro, (int __user *)(arg))) { - err = -EFAULT; - goto done_unlock; - } - err = -EINVAL; + case BLKROSET: + if (get_user(ro, (int __user *)(arg))) { + err = -EFAULT; + goto done_unlock; + } + err = -EINVAL; - /* if the bdev is going readonly the value of mddev->ro - * does not matter, no writes are coming - */ - if (ro) - goto done_unlock; + /* if the bdev is going readonly the value of mddev->ro + * does not matter, no writes are coming + */ + if (ro) + goto done_unlock; - /* are we are already prepared for writes? */ - if (mddev->ro != 1) - goto done_unlock; + /* are we are already prepared for writes? */ + if (mddev->ro != 1) + goto done_unlock; - /* transitioning to readauto need only happen for - * arrays that call md_write_start - */ - if (mddev->pers) { - err = restart_array(mddev); - if (err == 0) { - mddev->ro = 2; - set_disk_ro(mddev->gendisk, 0); - } + /* transitioning to readauto need only happen for + * arrays that call md_write_start + */ + if (mddev->pers) { + err = restart_array(mddev); + if (err == 0) { + mddev->ro = 2; + set_disk_ro(mddev->gendisk, 0); } - goto done_unlock; + } + goto done_unlock; } /* @@ -6540,37 +6532,36 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, } } - switch (cmd) + switch (cmd) { + case ADD_NEW_DISK: { - case ADD_NEW_DISK: - { - mdu_disk_info_t info; - if (copy_from_user(&info, argp, sizeof(info))) - err = -EFAULT; - else - err = add_new_disk(mddev, &info); - goto done_unlock; - } + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } - case HOT_REMOVE_DISK: - err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, new_decode_dev(arg)); + goto done_unlock; - case HOT_ADD_DISK: - err = hot_add_disk(mddev, new_decode_dev(arg)); - goto done_unlock; + case HOT_ADD_DISK: + err = hot_add_disk(mddev, new_decode_dev(arg)); + goto done_unlock; - case RUN_ARRAY: - err = do_md_run(mddev); - goto done_unlock; + case RUN_ARRAY: + err = do_md_run(mddev); + goto done_unlock; - case SET_BITMAP_FILE: - err = set_bitmap_file(mddev, (int)arg); - goto done_unlock; + case SET_BITMAP_FILE: + err = set_bitmap_file(mddev, (int)arg); + goto done_unlock; - default: - err = -EINVAL; - goto abort_unlock; + default: + err = -EINVAL; + goto abort_unlock; } done_unlock: -- cgit v0.10.2 From 35d78c66969fbbb51046e9f251849cf54e9ed88c Mon Sep 17 00:00:00 2001 From: kernelmail Date: Wed, 31 Oct 2012 11:59:10 +1100 Subject: md:Add place to update ->recovery_cp. In resyncing, recovery_cp only updated when resync aborted or completed. But in md drives,many place used it to judge.So add a place to update. Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 520056a..f970834 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7462,6 +7462,9 @@ void md_do_sync(struct md_thread *thread) wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = j; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + j > mddev->recovery_cp) + mddev->recovery_cp = j; set_bit(MD_CHANGE_CLEAN, &mddev->flags); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } -- cgit v0.10.2 From 54f89341e8b8da0cdac8a7b873491739de19f098 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 31 Oct 2012 11:59:10 +1100 Subject: md: Update checkpoint of resync/recovery based on time. md will current only only checkpoint recovery or resync ever 1/16th of the device size. As devices get larger this can become a long time an so a lot of work that might need to be duplicated after a shutdown. So add a time-based checkpoint. Every 5 minutes limits the amount of duplicated effort to at most 5 minutes, and has almost zero impact on performance. [changelog entry re-written by NeilBrown] Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index f970834..6aefa44 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7277,6 +7277,7 @@ EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) +#define UPDATE_FREQUENCY (5*60*HZ) void md_do_sync(struct md_thread *thread) { struct mddev *mddev = thread->mddev; @@ -7285,6 +7286,7 @@ void md_do_sync(struct md_thread *thread) window; sector_t max_sectors,j, io_sectors; unsigned long mark[SYNC_MARKS]; + unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; struct list_head *tmp; @@ -7444,6 +7446,7 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = j; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); md_new_event(mddev); + update_time = jiffies; blk_start_plug(&plug); while (j < max_sectors) { @@ -7455,6 +7458,7 @@ void md_do_sync(struct md_thread *thread) ((mddev->curr_resync > mddev->curr_resync_completed && (mddev->curr_resync - mddev->curr_resync_completed) > (max_sectors >> 4)) || + time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || (j - mddev->curr_resync_completed)*2 >= mddev->resync_max - mddev->curr_resync_completed )) { @@ -7465,6 +7469,7 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && j > mddev->recovery_cp) mddev->recovery_cp = j; + update_time = jiffies; set_bit(MD_CHANGE_CLEAN, &mddev->flags); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } -- cgit v0.10.2 From 7056741fd9fc14a65608549a4657cf5178f05f63 Mon Sep 17 00:00:00 2001 From: Jim Kukunas Date: Thu, 8 Nov 2012 13:47:44 -0800 Subject: lib/raid6: Add AVX2 optimized recovery functions Optimize RAID6 recovery functions to take advantage of the 256-bit YMM integer instructions introduced in AVX2. The patch was tested and benchmarked before submission. However hardware is not yet released so benchmark numbers cannot be reported. Acked-by: "H. Peter Anvin" Signed-off-by: Jim Kukunas Signed-off-by: NeilBrown diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 58790bd..95477aa 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -123,9 +123,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) +avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 640c69c..3156347 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -109,6 +109,7 @@ struct raid6_recov_calls { extern const struct raid6_recov_calls raid6_recov_intx1; extern const struct raid6_recov_calls raid6_recov_ssse3; +extern const struct raid6_recov_calls raid6_recov_avx2; /* Algorithm list */ extern const struct raid6_calls * const raid6_algos[]; diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index de06dfe..8c2e22b 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o -raid6_pq-y += algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \ +raid6_pq-y += algos.o recov.o recov_ssse3.o recov_avx2.o tables.o int1.o int2.o int4.o \ int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ altivec8.o mmx.o sse1.o sse2.o hostprogs-y += mktables diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 589f5f5..8b7f55c 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -72,6 +72,9 @@ EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) +#ifdef CONFIG_AS_AVX2 + &raid6_recov_avx2, +#endif &raid6_recov_ssse3, #endif &raid6_recov_intx1, diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c new file mode 100644 index 0000000..43a9bab --- /dev/null +++ b/lib/raid6/recov_avx2.c @@ -0,0 +1,327 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) + +#if CONFIG_AS_AVX2 + +#include +#include "x86.h" + +static int raid6_has_avx2(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX); +} + +static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* ymm0 = x0f[16] */ + asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0])); + asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32])); + asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0])); + asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32])); + asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (dq[0])); + asm volatile("vpxor %0, %%ymm9, %%ymm9" : : "m" (dq[32])); + asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (dp[0])); + asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (dp[32])); + + /* + * 1 = dq[0] ^ q[0] + * 9 = dq[32] ^ q[32] + * 0 = dp[0] ^ p[0] + * 8 = dp[32] ^ p[32] + */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); + + asm volatile("vpsraw $4, %ymm1, %ymm3"); + asm volatile("vpsraw $4, %ymm9, %ymm12"); + asm volatile("vpand %ymm7, %ymm1, %ymm1"); + asm volatile("vpand %ymm7, %ymm9, %ymm9"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm12, %ymm12"); + asm volatile("vpshufb %ymm9, %ymm4, %ymm14"); + asm volatile("vpshufb %ymm1, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm12, %ymm5, %ymm15"); + asm volatile("vpshufb %ymm3, %ymm5, %ymm5"); + asm volatile("vpxor %ymm14, %ymm15, %ymm15"); + asm volatile("vpxor %ymm4, %ymm5, %ymm5"); + + /* + * 5 = qx[0] + * 15 = qx[32] + */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16])); + asm volatile("vpsraw $4, %ymm0, %ymm2"); + asm volatile("vpsraw $4, %ymm8, %ymm6"); + asm volatile("vpand %ymm7, %ymm0, %ymm3"); + asm volatile("vpand %ymm7, %ymm8, %ymm14"); + asm volatile("vpand %ymm7, %ymm2, %ymm2"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpshufb %ymm14, %ymm4, %ymm12"); + asm volatile("vpshufb %ymm3, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm13"); + asm volatile("vpshufb %ymm2, %ymm1, %ymm1"); + asm volatile("vpxor %ymm4, %ymm1, %ymm1"); + asm volatile("vpxor %ymm12, %ymm13, %ymm13"); + + /* + * 1 = pbmul[px[0]] + * 13 = pbmul[px[32]] + */ + asm volatile("vpxor %ymm5, %ymm1, %ymm1"); + asm volatile("vpxor %ymm15, %ymm13, %ymm13"); + + /* + * 1 = db = DQ + * 13 = db[32] = DQ[32] + */ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm13,%0" : "=m" (dq[32])); + asm volatile("vpxor %ymm1, %ymm0, %ymm0"); + asm volatile("vpxor %ymm13, %ymm8, %ymm8"); + + asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0])); + asm volatile("vmovdqa %%ymm8, %0" : "=m" (dp[32])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; +#else + asm volatile("vmovdqa %0, %%ymm1" : : "m" (*q)); + asm volatile("vmovdqa %0, %%ymm0" : : "m" (*p)); + asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (*dq)); + asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (*dp)); + + /* 1 = dq ^ q; 0 = dp ^ p */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); + + /* + * 1 = dq ^ q + * 3 = dq ^ p >> 4 + */ + asm volatile("vpsraw $4, %ymm1, %ymm3"); + asm volatile("vpand %ymm7, %ymm1, %ymm1"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpshufb %ymm1, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm3, %ymm5, %ymm5"); + asm volatile("vpxor %ymm4, %ymm5, %ymm5"); + + /* 5 = qx */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16])); + + asm volatile("vpsraw $4, %ymm0, %ymm2"); + asm volatile("vpand %ymm7, %ymm0, %ymm3"); + asm volatile("vpand %ymm7, %ymm2, %ymm2"); + asm volatile("vpshufb %ymm3, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm2, %ymm1, %ymm1"); + asm volatile("vpxor %ymm4, %ymm1, %ymm1"); + + /* 1 = pbmul[px] */ + asm volatile("vpxor %ymm5, %ymm1, %ymm1"); + /* 1 = db = DQ */ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + + asm volatile("vpxor %ymm1, %ymm0, %ymm0"); + asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0])); + + bytes -= 32; + p += 32; + q += 32; + dp += 32; + dq += 32; +#endif + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0])); + asm volatile("vmovdqa %0, %%ymm8" : : "m" (dq[32])); + asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0])); + asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (q[32])); + + /* + * 3 = q[0] ^ dq[0] + * 8 = q[32] ^ dq[32] + */ + asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0])); + asm volatile("vmovapd %ymm0, %ymm13"); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16])); + asm volatile("vmovapd %ymm1, %ymm14"); + + asm volatile("vpsraw $4, %ymm3, %ymm6"); + asm volatile("vpsraw $4, %ymm8, %ymm12"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm8, %ymm8"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpand %ymm7, %ymm12, %ymm12"); + asm volatile("vpshufb %ymm3, %ymm0, %ymm0"); + asm volatile("vpshufb %ymm8, %ymm13, %ymm13"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm1"); + asm volatile("vpshufb %ymm12, %ymm14, %ymm14"); + asm volatile("vpxor %ymm0, %ymm1, %ymm1"); + asm volatile("vpxor %ymm13, %ymm14, %ymm14"); + + /* + * 1 = qmul[q[0] ^ dq[0]] + * 14 = qmul[q[32] ^ dq[32]] + */ + asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0])); + asm volatile("vmovdqa %0, %%ymm12" : : "m" (p[32])); + asm volatile("vpxor %ymm1, %ymm2, %ymm2"); + asm volatile("vpxor %ymm14, %ymm12, %ymm12"); + + /* + * 2 = p[0] ^ qmul[q[0] ^ dq[0]] + * 12 = p[32] ^ qmul[q[32] ^ dq[32]] + */ + + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm14, %0" : "=m" (dq[32])); + asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); + asm volatile("vmovdqa %%ymm12,%0" : "=m" (p[32])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; +#else + asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0])); + asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0])); + + /* 3 = q ^ dq */ + + asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16])); + + asm volatile("vpsraw $4, %ymm3, %ymm6"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpshufb %ymm3, %ymm0, %ymm0"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm1"); + asm volatile("vpxor %ymm0, %ymm1, %ymm1"); + + /* 1 = qmul[q ^ dq] */ + + asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0])); + asm volatile("vpxor %ymm1, %ymm2, %ymm2"); + + /* 2 = p ^ qmul[q ^ dq] */ + + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); + + bytes -= 32; + p += 32; + q += 32; + dq += 32; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_avx2 = { + .data2 = raid6_2data_recov_avx2, + .datap = raid6_datap_recov_avx2, + .valid = raid6_has_avx2, +#ifdef CONFIG_X86_64 + .name = "avx2x2", +#else + .name = "avx2x1", +#endif + .priority = 2, +}; + +#else +#warning "your version of binutils lacks AVX2 support" +#endif + +#endif diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index c76151d..d919c98 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -23,7 +23,7 @@ RANLIB = ranlib all: raid6.a raid6test raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ - altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \ + altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o recov_avx2.o algos.o \ tables.o rm -f $@ $(AR) cq $@ $^ diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h index d55d632..b759548 100644 --- a/lib/raid6/x86.h +++ b/lib/raid6/x86.h @@ -45,19 +45,23 @@ static inline void kernel_fpu_end(void) #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ /* Should work well enough on modern CPUs for testing */ static inline int boot_cpu_has(int flag) { - u32 eax = (flag & 0x20) ? 0x80000001 : 1; - u32 ecx, edx; + u32 eax, ebx, ecx, edx; + + eax = (flag & 0x100) ? 7 : + (flag & 0x20) ? 0x80000001 : 1; + ecx = 0; asm volatile("cpuid" - : "+a" (eax), "=d" (edx), "=c" (ecx) - : : "ebx"); + : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx)); - return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1; + return ((flag & 0x100 ? ebx : + (flag & 0x80) ? ecx : edx) >> (flag & 31)) & 1; } #endif /* ndef __KERNEL__ */ -- cgit v0.10.2 From 2c935842bdb46f5f557426feb4d2bdfdad1aa5f9 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Fri, 30 Nov 2012 13:10:39 -0800 Subject: lib/raid6: Add AVX2 optimized gen_syndrome functions Add AVX2 optimized gen_syndrom functions, which is simply based on sse2.c written by hpa. Signed-off-by: Yuanhan Liu Reviewed-by: H. Peter Anvin Signed-off-by: Jim Kukunas Signed-off-by: NeilBrown diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 3156347..8dfaa2c 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -98,6 +98,9 @@ extern const struct raid6_calls raid6_altivec1; extern const struct raid6_calls raid6_altivec2; extern const struct raid6_calls raid6_altivec4; extern const struct raid6_calls raid6_altivec8; +extern const struct raid6_calls raid6_avx2x1; +extern const struct raid6_calls raid6_avx2x2; +extern const struct raid6_calls raid6_avx2x4; struct raid6_recov_calls { void (*data2)(int, size_t, int, int, void **); diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 8c2e22b..3430711 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o raid6_pq-y += algos.o recov.o recov_ssse3.o recov_avx2.o tables.o int1.o int2.o int4.o \ int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ - altivec8.o mmx.o sse1.o sse2.o + altivec8.o mmx.o sse1.o sse2.o avx2.o hostprogs-y += mktables quiet_cmd_unroll = UNROLL $@ diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 8b7f55c..6d7316f 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -45,11 +45,20 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_sse1x2, &raid6_sse2x1, &raid6_sse2x2, +#ifdef CONFIG_AS_AVX2 + &raid6_avx2x1, + &raid6_avx2x2, +#endif #endif #if defined(__x86_64__) && !defined(__arch_um__) &raid6_sse2x1, &raid6_sse2x2, &raid6_sse2x4, +#ifdef CONFIG_AS_AVX2 + &raid6_avx2x1, + &raid6_avx2x2, + &raid6_avx2x4, +#endif #endif #ifdef CONFIG_ALTIVEC &raid6_altivec1, diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c new file mode 100644 index 0000000..bc3b1dd --- /dev/null +++ b/lib/raid6/avx2.c @@ -0,0 +1,251 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 2012 Intel Corporation + * Author: Yuanhan Liu + * + * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * AVX2 implementation of RAID-6 syndrome functions + * + */ + +#ifdef CONFIG_AS_AVX2 + +#include +#include "x86.h" + +static const struct raid6_avx2_constants { + u64 x1d[4]; +} raid6_avx2_constants __aligned(32) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, +}; + +static int raid6_have_avx2(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX); +} + +/* + * Plain AVX2 implementation + */ +static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* Zero temp */ + + for (d = 0; d < bytes; d += 32) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */ + asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d])); + for (z = z0-2; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm6,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm4,%ymm4"); + asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d])); + } + asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm6,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm4,%ymm4"); + + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x1 = { + raid6_avx21_gen_syndrome, + raid6_have_avx2, + "avx2x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 AVX2 implementation + */ +static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 32 bytes */ + for (d = 0; d < bytes; d += 64) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ + asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */ + asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */ + asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */ + for (z = z0-1; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); + asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x2 = { + raid6_avx22_gen_syndrome, + raid6_have_avx2, + "avx2x2", + 1 /* Has cache hints */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 AVX2 implementation + */ +static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); /* P[0] */ + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* P[1] */ + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); /* Q[0] */ + asm volatile("vpxor %ymm6,%ymm6,%ymm6"); /* Q[1] */ + asm volatile("vpxor %ymm10,%ymm10,%ymm10"); /* P[2] */ + asm volatile("vpxor %ymm11,%ymm11,%ymm11"); /* P[3] */ + asm volatile("vpxor %ymm12,%ymm12,%ymm12"); /* Q[2] */ + asm volatile("vpxor %ymm14,%ymm14,%ymm14"); /* Q[3] */ + + for (d = 0; d < bytes; d += 128) { + for (z = z0; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96])); + asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); + asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64])); + asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm13,%ymm10,%ymm10"); + asm volatile("vpxor %ymm15,%ymm11,%ymm11"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); + asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); + asm volatile("vpxor %ymm10,%ymm10,%ymm10"); + asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); + asm volatile("vpxor %ymm11,%ymm11,%ymm11"); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vpxor %ymm6,%ymm6,%ymm6"); + asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); + asm volatile("vpxor %ymm12,%ymm12,%ymm12"); + asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); + asm volatile("vpxor %ymm14,%ymm14,%ymm14"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x4 = { + raid6_avx24_gen_syndrome, + raid6_have_avx2, + "avx2x4", + 1 /* Has cache hints */ +}; +#endif + +#endif /* CONFIG_AS_AVX2 */ diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index d919c98..754cbac 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -11,6 +11,16 @@ AWK = awk -f AR = ar RANLIB = ranlib +ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/) +ifeq ($(ARCH),i386) + CFLAGS += -DCONFIG_X86_32 +endif +ifeq ($(ARCH),x86_64) + CFLAGS += -DCONFIG_X86_64 +endif +CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1"| gcc -c -x assembler - &&\ + rm ./-.o && echo -DCONFIG_AS_AVX2=1) + .c.o: $(CC) $(CFLAGS) -c -o $@ $< @@ -22,7 +32,7 @@ RANLIB = ranlib all: raid6.a raid6test -raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ +raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o avx2.o \ altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o recov_avx2.o algos.o \ tables.o rm -f $@ -- cgit v0.10.2 From 4f8c55c5ad491dbc7b52ce08bb702ca39ce944cf Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Fri, 30 Nov 2012 13:10:40 -0800 Subject: lib/raid6: build proper files on corresponding arch sse and avx2 stuff only exist on x86 arch, and we don't need to build altivec on x86. And we can do that at lib/raid6/Makefile. Proposed-by: H. Peter Anvin Signed-off-by: Yuanhan Liu Reviewed-by: H. Peter Anvin Signed-off-by: Jim Kukunas Signed-off-by: NeilBrown diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 3430711..9f7c184 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -1,8 +1,11 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o -raid6_pq-y += algos.o recov.o recov_ssse3.o recov_avx2.o tables.o int1.o int2.o int4.o \ - int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ - altivec8.o mmx.o sse1.o sse2.o avx2.o +raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ + int8.o int16.o int32.o + +raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o +raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o + hostprogs-y += mktables quiet_cmd_unroll = UNROLL $@ diff --git a/lib/raid6/altivec.uc b/lib/raid6/altivec.uc index b71012b..7cc12b5 100644 --- a/lib/raid6/altivec.uc +++ b/lib/raid6/altivec.uc @@ -24,13 +24,10 @@ #include -#ifdef CONFIG_ALTIVEC - #include #ifdef __KERNEL__ # include # include -#endif /* * This is the C data type to use. We use a vector of diff --git a/lib/raid6/mmx.c b/lib/raid6/mmx.c index 279347f..590c71c 100644 --- a/lib/raid6/mmx.c +++ b/lib/raid6/mmx.c @@ -16,7 +16,7 @@ * MMX implementation of RAID-6 syndrome functions */ -#if defined(__i386__) && !defined(__arch_um__) +#ifdef CONFIG_X86_32 #include #include "x86.h" diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c index 43a9bab..e1eea43 100644 --- a/lib/raid6/recov_avx2.c +++ b/lib/raid6/recov_avx2.c @@ -8,8 +8,6 @@ * of the License. */ -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) - #if CONFIG_AS_AVX2 #include @@ -323,5 +321,3 @@ const struct raid6_recov_calls raid6_recov_avx2 = { #else #warning "your version of binutils lacks AVX2 support" #endif - -#endif diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c index ecb710c..a916832 100644 --- a/lib/raid6/recov_ssse3.c +++ b/lib/raid6/recov_ssse3.c @@ -7,8 +7,6 @@ * of the License. */ -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) - #include #include "x86.h" @@ -332,5 +330,3 @@ const struct raid6_recov_calls raid6_recov_ssse3 = { #endif .priority = 1, }; - -#endif diff --git a/lib/raid6/sse1.c b/lib/raid6/sse1.c index 10dd9194..f762971 100644 --- a/lib/raid6/sse1.c +++ b/lib/raid6/sse1.c @@ -21,7 +21,7 @@ * worthwhile as a separate implementation. */ -#if defined(__i386__) && !defined(__arch_um__) +#ifdef CONFIG_X86_32 #include #include "x86.h" diff --git a/lib/raid6/sse2.c b/lib/raid6/sse2.c index bc2d57d..85b82c8 100644 --- a/lib/raid6/sse2.c +++ b/lib/raid6/sse2.c @@ -17,8 +17,6 @@ * */ -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) - #include #include "x86.h" @@ -159,9 +157,7 @@ const struct raid6_calls raid6_sse2x2 = { 1 /* Has cache hints */ }; -#endif - -#if defined(__x86_64__) && !defined(__arch_um__) +#ifdef CONFIG_X86_64 /* * Unrolled-by-4 SSE2 implementation @@ -259,4 +255,4 @@ const struct raid6_calls raid6_sse2x4 = { 1 /* Has cache hints */ }; -#endif +#endif /* CONFIG_X86_64 */ diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 754cbac..087332d 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -10,16 +10,31 @@ LD = ld AWK = awk -f AR = ar RANLIB = ranlib +OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/) ifeq ($(ARCH),i386) CFLAGS += -DCONFIG_X86_32 + IS_X86 = yes endif ifeq ($(ARCH),x86_64) CFLAGS += -DCONFIG_X86_64 + IS_X86 = yes +endif + +ifeq ($(IS_X86),yes) + OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o + CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ + gcc -c -x assembler - >&/dev/null && \ + rm ./-.o && echo -DCONFIG_AS_AVX2=1) +else + HAS_ALTIVEC := $(shell echo -e '\#include \nvector int a;' |\ + gcc -c -x c - >&/dev/null && \ + rm ./-.o && echo yes) + ifeq ($(HAS_ALTIVEC),yes) + OBJS += altivec1.o altivec2.o altivec4.o altivec8.o + endif endif -CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1"| gcc -c -x assembler - &&\ - rm ./-.o && echo -DCONFIG_AS_AVX2=1) .c.o: $(CC) $(CFLAGS) -c -o $@ $< @@ -32,9 +47,7 @@ CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1"| gcc -c -x assembler - &&\ all: raid6.a raid6test -raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o avx2.o \ - altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o recov_avx2.o algos.o \ - tables.o +raid6.a: $(OBJS) rm -f $@ $(AR) cq $@ $^ $(RANLIB) $@ -- cgit v0.10.2 From 0a19caabf01ac138bf3668786939e50ea4d9c8ac Mon Sep 17 00:00:00 2001 From: majianpeng Date: Mon, 19 Nov 2012 19:57:34 +0800 Subject: md: Use ->curr_resync as last completed request when cleanly aborting resync. If a resync is aborted cleanly, ->curr_resync is a reliable record of where we got up to. If there was an error it is less reliable but we always know that ->curr_resync_completed is safe. So add a flag MD_RECOVERY_ERROR to differentiate between these cases and set recovery_cp accordingly. Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 6aefa44..30ba223 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7180,6 +7180,7 @@ void md_done_sync(struct mddev *mddev, int blocks, int ok) wake_up(&mddev->recovery_wait); if (!ok) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_bit(MD_RECOVERY_ERROR, &mddev->recovery); md_wakeup_thread(mddev->thread); // stop recovery, signal do_sync .... } @@ -7574,8 +7575,13 @@ void md_do_sync(struct md_thread *thread) printk(KERN_INFO "md: checkpointing %s of %s.\n", desc, mdname(mddev)); - mddev->recovery_cp = - mddev->curr_resync_completed; + if (test_bit(MD_RECOVERY_ERROR, + &mddev->recovery)) + mddev->recovery_cp = + mddev->curr_resync_completed; + else + mddev->recovery_cp = + mddev->curr_resync; } } else mddev->recovery_cp = MaxSector; diff --git a/drivers/md/md.h b/drivers/md/md.h index af443ab..c29e62e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -307,6 +307,7 @@ struct mddev { * REQUEST: user-space has requested a sync (used with SYNC) * CHECK: user-space request for check-only, no repair * RESHAPE: A reshape is happening + * ERROR: sync-action interrupted because io-error * * If neither SYNC or RESHAPE are set, then it is a recovery. */ @@ -320,6 +321,7 @@ struct mddev { #define MD_RECOVERY_CHECK 7 #define MD_RECOVERY_RESHAPE 8 #define MD_RECOVERY_FROZEN 9 +#define MD_RECOVERY_ERROR 10 unsigned long recovery; /* If a RAID personality determines that recovery (of a particular -- cgit v0.10.2 From 749586b7d34df910118bff2c248d08877d772e81 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 20 Nov 2012 14:11:15 +1100 Subject: md/raid5: use async_tx_quiesce() instead of open-coding it. handle_stripe_expansion contains: if (tx) { async_tx_ack(tx); dma_wait_for_async_tx(tx); } which is very similar to the body of async_tx_quiesce(), except that the later handles an error from dma_wait_for_async_tx() (admittedly by panicing, but that decision belongs in the dma code, not the md code). So just us async_tx_quiesce(). Acked-by: Dan Williams Reported-by: Bartlomiej Zolnierkiewicz Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a450268..2cf23f2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3224,10 +3224,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) } /* done submitting copies, wait for them to complete */ - if (tx) { - async_tx_ack(tx); - dma_wait_for_async_tx(tx); - } + async_tx_quiesce(&tx); } /* -- cgit v0.10.2 From a9add5d92b64ea57fb4c3b557c3891cdeb15fa0c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 31 Oct 2012 11:59:09 +1100 Subject: md/raid5: add blktrace calls This makes it easier to trace what raid5 is doing. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2cf23f2..ffebc1e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -53,6 +53,8 @@ #include #include #include +#include + #include "md.h" #include "raid5.h" #include "raid0.h" @@ -182,6 +184,8 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); bi = return_bi; } @@ -671,6 +675,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_next = NULL; if (rrdev) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); + trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), + bi, disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); generic_make_request(bi); } if (rrdev) { @@ -698,6 +705,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_size = STRIPE_SIZE; rbi->bi_next = NULL; + trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), + rbi, disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); generic_make_request(rbi); } if (!rdev && !rrdev) { @@ -2855,8 +2865,10 @@ static void handle_stripe_dirtying(struct r5conf *conf, pr_debug("for sector %llu, rmw=%d rcw=%d\n", (unsigned long long)sh->sector, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); - if (rmw < rcw && rmw > 0) + if (rmw < rcw && rmw > 0) { /* prefer read-modify-write, but need to get some data */ + blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d", + (unsigned long long)sh->sector, rmw); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && @@ -2867,7 +2879,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, if ( test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { pr_debug("Read_old block " - "%d for r-m-w\n", i); + "%d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; @@ -2877,8 +2889,10 @@ static void handle_stripe_dirtying(struct r5conf *conf, } } } + } if (rcw <= rmw && rcw > 0) { /* want reconstruct write, but need to get some data */ + int qread =0; rcw = 0; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -2897,12 +2911,17 @@ static void handle_stripe_dirtying(struct r5conf *conf, set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; + qread++; } else { set_bit(STRIPE_DELAYED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } } } + if (rcw) + blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", + (unsigned long long)sh->sector, + rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); } /* now if nothing is locked, and if we have enough data, * we can start a write request @@ -3900,6 +3919,8 @@ static void raid5_align_endio(struct bio *bi, int error) rdev_dec_pending(rdev, conf->mddev); if (!error && uptodate) { + trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), + raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); @@ -4004,6 +4025,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) atomic_inc(&conf->active_aligned_reads); spin_unlock_irq(&conf->device_lock); + trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), + align_bi, disk_devt(mddev->gendisk), + raid_bio->bi_sector); generic_make_request(align_bi); return 1; } else { @@ -4078,6 +4102,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) struct stripe_head *sh; struct mddev *mddev = cb->cb.data; struct r5conf *conf = mddev->private; + int cnt = 0; if (cb->list.next && !list_empty(&cb->list)) { spin_lock_irq(&conf->device_lock); @@ -4092,9 +4117,11 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) smp_mb__before_clear_bit(); clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); __release_stripe(conf, sh); + cnt++; } spin_unlock_irq(&conf->device_lock); } + trace_block_unplug(mddev->queue, cnt, !from_schedule); kfree(cb); } @@ -4352,6 +4379,8 @@ static void make_request(struct mddev *mddev, struct bio * bi) if ( rw == WRITE ) md_write_end(mddev); + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); } } @@ -4728,8 +4757,11 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) handled++; } remaining = raid5_dec_bi_active_stripes(raid_bio); - if (remaining == 0) + if (remaining == 0) { + trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), + raid_bio, 0); bio_endio(raid_bio, 0); + } if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; -- cgit v0.10.2