From ee86273062cbb310665fe49e1f1937d2cf85b0b9 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Mon, 23 Aug 2010 15:16:00 +0200 Subject: loop: add some basic read-only sysfs attributes Create /sys/block/loopX/loop directory and provide these attributes: - backing_file - autoclear - offset - sizelimit This loop directory is present only if loop device is configured. To be used in util-linux-ng (and possibly elsewhere like udev rules) where code need to get loop attributes from kernel (and not store duplicate info in userspace). Moreover loop ioctls are not even able to provide full backing file info because of buffer limits. Signed-off-by: Milan Broz Signed-off-by: Jens Axboe diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f3c636d..dc55230 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -74,6 +74,7 @@ #include #include #include +#include #include @@ -737,6 +738,103 @@ static inline int is_loop_device(struct file *file) return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; } +/* loop sysfs attributes */ + +static ssize_t loop_attr_show(struct device *dev, char *page, + ssize_t (*callback)(struct loop_device *, char *)) +{ + struct loop_device *l, *lo = NULL; + + mutex_lock(&loop_devices_mutex); + list_for_each_entry(l, &loop_devices, lo_list) + if (disk_to_dev(l->lo_disk) == dev) { + lo = l; + break; + } + mutex_unlock(&loop_devices_mutex); + + return lo ? callback(lo, page) : -EIO; +} + +#define LOOP_ATTR_RO(_name) \ +static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \ +static ssize_t loop_attr_do_show_##_name(struct device *d, \ + struct device_attribute *attr, char *b) \ +{ \ + return loop_attr_show(d, b, loop_attr_##_name##_show); \ +} \ +static struct device_attribute loop_attr_##_name = \ + __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL); + +static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) +{ + ssize_t ret; + char *p = NULL; + + mutex_lock(&lo->lo_ctl_mutex); + if (lo->lo_backing_file) + p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1); + mutex_unlock(&lo->lo_ctl_mutex); + + if (IS_ERR_OR_NULL(p)) + ret = PTR_ERR(p); + else { + ret = strlen(p); + memmove(buf, p, ret); + buf[ret++] = '\n'; + buf[ret] = 0; + } + + return ret; +} + +static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset); +} + +static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); +} + +static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) +{ + int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); + + return sprintf(buf, "%s\n", autoclear ? 
"1" : "0"); +} + +LOOP_ATTR_RO(backing_file); +LOOP_ATTR_RO(offset); +LOOP_ATTR_RO(sizelimit); +LOOP_ATTR_RO(autoclear); + +static struct attribute *loop_attrs[] = { + &loop_attr_backing_file.attr, + &loop_attr_offset.attr, + &loop_attr_sizelimit.attr, + &loop_attr_autoclear.attr, + NULL, +}; + +static struct attribute_group loop_attribute_group = { + .name = "loop", + .attrs= loop_attrs, +}; + +static int loop_sysfs_init(struct loop_device *lo) +{ + return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj, + &loop_attribute_group); +} + +static void loop_sysfs_exit(struct loop_device *lo) +{ + sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj, + &loop_attribute_group); +} + static int loop_set_fd(struct loop_device *lo, fmode_t mode, struct block_device *bdev, unsigned int arg) { @@ -836,6 +934,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, set_capacity(lo->lo_disk, size); bd_set_size(bdev, size << 9); + loop_sysfs_init(lo); /* let user-space know about the new size */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); @@ -854,6 +953,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, return 0; out_clr: + loop_sysfs_exit(lo); lo->lo_thread = NULL; lo->lo_device = NULL; lo->lo_backing_file = NULL; @@ -950,6 +1050,7 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) set_capacity(lo->lo_disk, 0); if (bdev) { bd_set_size(bdev, 0); + loop_sysfs_exit(lo); /* let user-space know about this change */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); } -- cgit v0.10.2 From 0a25a5aee727c4a56c7d39e0e595947b02ee2696 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:55:34 -0500 Subject: cciss: factor out cciss_getpciinfo Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 31064df..4fe5e42 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1230,9 +1230,22 @@ static void check_ioctl_unit_attention(ctlr_info_t *h, CommandList_struct *c) c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION) (void)check_for_unit_attention(h, c); } -/* - * ioctl - */ + +static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp) +{ + cciss_pci_info_struct pciinfo; + + if (!argp) + return -EINVAL; + pciinfo.domain = pci_domain_nr(h->pdev->bus); + pciinfo.bus = h->pdev->bus->number; + pciinfo.dev_fn = h->pdev->devfn; + pciinfo.board_id = h->board_id; + if (copy_to_user(argp, &pciinfo, sizeof(cciss_pci_info_struct))) + return -EFAULT; + return 0; +} + static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1245,20 +1258,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, cmd, arg); switch (cmd) { case CCISS_GETPCIINFO: - { - cciss_pci_info_struct pciinfo; - - if (!arg) - return -EINVAL; - pciinfo.domain = pci_domain_nr(h->pdev->bus); - pciinfo.bus = h->pdev->bus->number; - pciinfo.dev_fn = h->pdev->devfn; - pciinfo.board_id = h->board_id; - if (copy_to_user - (argp, &pciinfo, sizeof(cciss_pci_info_struct))) - return -EFAULT; - return 0; - } + return cciss_getpciinfo(h, argp); case CCISS_GETINTINFO: { cciss_coalint_struct intinfo; -- cgit v0.10.2 From 576e661c658ab7d2a15cc12d5b8a1600db81ec0a Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:55:39 -0500 Subject: cciss: factor out cciss_getintinfo Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 4fe5e42..fc76140 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1246,6 +1246,20 @@ static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp) return 0; } +static int cciss_getintinfo(ctlr_info_t *h, void __user *argp) +{ + cciss_coalint_struct intinfo; + + if (!argp) + return -EINVAL; + intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay); + intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount); + if (copy_to_user + (argp, &intinfo, sizeof(cciss_coalint_struct))) + return -EFAULT; + return 0; +} + static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1260,19 +1274,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_GETPCIINFO: return cciss_getpciinfo(h, argp); case CCISS_GETINTINFO: - { - cciss_coalint_struct intinfo; - if (!arg) - return -EINVAL; - intinfo.delay = - readl(&h->cfgtable->HostWrite.CoalIntDelay); - intinfo.count = - readl(&h->cfgtable->HostWrite.CoalIntCount); - if (copy_to_user - (argp, &intinfo, sizeof(cciss_coalint_struct))) - return -EFAULT; - return 0; - } + return cciss_getintinfo(h, argp); case CCISS_SETINTINFO: { cciss_coalint_struct intinfo; -- cgit v0.10.2 From 4c800eed9a46f7b6a469d24e7e6051b23e62bb69 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:55:44 -0500 Subject: cciss: factor out cciss_setintinfo Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index fc76140..5119c88 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1260,6 +1260,37 @@ static int cciss_getintinfo(ctlr_info_t *h, void __user *argp) return 0; } +static int cciss_setintinfo(ctlr_info_t *h, void __user *argp) +{ + cciss_coalint_struct intinfo; + unsigned long flags; + int i; + + if (!argp) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&intinfo, argp, sizeof(intinfo))) + return -EFAULT; + if ((intinfo.delay == 0) && (intinfo.count == 0)) + return -EINVAL; + spin_lock_irqsave(&h->lock, flags); + /* Update the field, and then ring the doorbell */ + writel(intinfo.delay, &(h->cfgtable->HostWrite.CoalIntDelay)); + writel(intinfo.count, &(h->cfgtable->HostWrite.CoalIntCount)); + writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); + + for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) { + if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq)) + break; + udelay(1000); /* delay and try again */ + } + spin_unlock_irqrestore(&h->lock, flags); + if (i >= MAX_IOCTL_CONFIG_WAIT) + return -EAGAIN; + return 0; +} + static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1276,40 +1307,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_GETINTINFO: return cciss_getintinfo(h, argp); case CCISS_SETINTINFO: - { - cciss_coalint_struct intinfo; - unsigned long flags; - int i; - - if (!arg) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user - (&intinfo, argp, sizeof(cciss_coalint_struct))) - return -EFAULT; - if ((intinfo.delay == 0) && (intinfo.count == 0)) - return -EINVAL; - spin_lock_irqsave(&h->lock, flags); - /* Update the field, and then ring the doorbell */ - writel(intinfo.delay, - &(h->cfgtable->HostWrite.CoalIntDelay)); - writel(intinfo.count, - &(h->cfgtable->HostWrite.CoalIntCount)); - writel(CFGTBL_ChangeReq, h->vaddr + 
SA5_DOORBELL); - - for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) { - if (!(readl(h->vaddr + SA5_DOORBELL) - & CFGTBL_ChangeReq)) - break; - /* delay and try again */ - udelay(1000); - } - spin_unlock_irqrestore(&h->lock, flags); - if (i >= MAX_IOCTL_CONFIG_WAIT) - return -EAGAIN; - return 0; - } + return cciss_setintinfo(h, argp); case CCISS_GETNODENAME: { NodeName_type NodeName; -- cgit v0.10.2 From 252161094255ffdc277b8a0e5e12b0a8ff67b10f Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:55:49 -0500 Subject: cciss: factor out cciss_getnodename Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 5119c88..9a3dd20 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1291,6 +1291,20 @@ static int cciss_setintinfo(ctlr_info_t *h, void __user *argp) return 0; } +static int cciss_getnodename(ctlr_info_t *h, void __user *argp) +{ + NodeName_type NodeName; + int i; + + if (!argp) + return -EINVAL; + for (i = 0; i < 16; i++) + NodeName[i] = readb(&h->cfgtable->ServerName[i]); + if (copy_to_user(argp, NodeName, sizeof(NodeName_type))) + return -EFAULT; + return 0; +} + static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1309,19 +1323,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_SETINTINFO: return cciss_setintinfo(h, argp); case CCISS_GETNODENAME: - { - NodeName_type NodeName; - int i; - - if (!arg) - return -EINVAL; - for (i = 0; i < 16; i++) - NodeName[i] = - readb(&h->cfgtable->ServerName[i]); - if (copy_to_user(argp, NodeName, sizeof(NodeName_type))) - return -EFAULT; - return 0; - } + return cciss_getnodename(h, argp); case CCISS_SETNODENAME: { NodeName_type NodeName; -- cgit v0.10.2 From 4f43f32cd3f65cfee5f30d7e6be55854cf33809b Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:55:54 -0500 Subject: cciss: factor out cciss_setnodename Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 9a3dd20..8bbb1c8 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1305,6 +1305,34 @@ static int cciss_getnodename(ctlr_info_t *h, void __user *argp) return 0; } +static int cciss_setnodename(ctlr_info_t *h, void __user *argp) +{ + NodeName_type NodeName; + unsigned long flags; + int i; + + if (!argp) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(NodeName, argp, sizeof(NodeName_type))) + return -EFAULT; + spin_lock_irqsave(&h->lock, flags); + /* Update the field, and then ring the doorbell */ + for (i = 0; i < 16; i++) + writeb(NodeName[i], &h->cfgtable->ServerName[i]); + writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); + for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) { + if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq)) + break; + udelay(1000); /* delay and try again */ + } + spin_unlock_irqrestore(&h->lock, flags); + if (i >= MAX_IOCTL_CONFIG_WAIT) + return -EAGAIN; + return 0; +} + static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1325,42 +1353,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_GETNODENAME: return cciss_getnodename(h, argp); case CCISS_SETNODENAME: - { - NodeName_type NodeName; - unsigned long flags; - int i; - - if (!arg) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user - (NodeName, argp, sizeof(NodeName_type))) - return -EFAULT; - - spin_lock_irqsave(&h->lock, flags); - - /* Update the field, and then ring the doorbell */ - for (i = 0; i < 16; i++) - writeb(NodeName[i], - &h->cfgtable->ServerName[i]); - - writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); - - for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) { - if (!(readl(h->vaddr + SA5_DOORBELL) - & CFGTBL_ChangeReq)) - break; - /* delay and try again */ - udelay(1000); - } - spin_unlock_irqrestore(&h->lock, flags); - if (i >= MAX_IOCTL_CONFIG_WAIT) - return -EAGAIN; - return 0; - } - + return cciss_setnodename(h, argp); case CCISS_GETHEARTBEAT: { Heartbeat_type heartbeat; -- cgit v0.10.2 From 93c74931131d3a7af3a6aa18aab0d39978b5d3b5 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:55:59 -0500 Subject: cciss: factor out cciss_getheartbeat Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 8bbb1c8..e716fc9 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1333,6 +1333,18 @@ static int cciss_setnodename(ctlr_info_t *h, void __user *argp) return 0; } +static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp) +{ + Heartbeat_type heartbeat; + + if (!argp) + return -EINVAL; + heartbeat = readl(&h->cfgtable->HeartBeat); + if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type))) + return -EFAULT; + return 0; +} + static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1355,17 +1367,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_SETNODENAME: return cciss_setnodename(h, argp); case CCISS_GETHEARTBEAT: - { - Heartbeat_type heartbeat; - - if (!arg) - return -EINVAL; - heartbeat = readl(&h->cfgtable->HeartBeat); - if (copy_to_user - (argp, &heartbeat, sizeof(Heartbeat_type))) - return -EFAULT; - return 0; - } + return cciss_getheartbeat(h, argp); case CCISS_GETBUSTYPES: { BusTypes_type BusTypes; -- cgit v0.10.2 From d18dfad4e2bf6c2d2c5c104d882b14b9fee71b14 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:56:05 -0500 Subject: cciss: factor out cciss_getbustypes Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index e716fc9..e7b650b 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1345,6 +1345,18 @@ static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp) return 0; } +static int cciss_getbustypes(ctlr_info_t *h, void __user *argp) +{ + BusTypes_type BusTypes; + + if (!argp) + return -EINVAL; + BusTypes = readl(&h->cfgtable->BusTypes); + if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type))) + return -EFAULT; + return 0; +} + static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1369,17 +1381,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_GETHEARTBEAT: return cciss_getheartbeat(h, argp); case CCISS_GETBUSTYPES: - { - BusTypes_type BusTypes; - - if (!arg) - return -EINVAL; - BusTypes = readl(&h->cfgtable->BusTypes); - if (copy_to_user - (argp, &BusTypes, sizeof(BusTypes_type))) - return -EFAULT; - return 0; - } + return cciss_getbustypes(h, argp); case CCISS_GETFIRMVER: { FirmwareVer_type firmware; -- cgit v0.10.2 From 8a4f7fbfdd76a304c8f04dde6b8fd0e5c50bbe76 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:56:10 -0500 Subject: cciss: factor out cciss_getfirmver Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index e7b650b..c47e601 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1357,6 +1357,20 @@ static int cciss_getbustypes(ctlr_info_t *h, void __user *argp) return 0; } +static int cciss_getfirmver(ctlr_info_t *h, void __user *argp) +{ + FirmwareVer_type firmware; + + if (!argp) + return -EINVAL; + memcpy(firmware, h->firm_ver, 4); + + if (copy_to_user + (argp, firmware, sizeof(FirmwareVer_type))) + return -EFAULT; + return 0; +} + static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1383,18 +1397,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_GETBUSTYPES: return cciss_getbustypes(h, argp); case CCISS_GETFIRMVER: - { - FirmwareVer_type firmware; - - if (!arg) - return -EINVAL; - memcpy(firmware, h->firm_ver, 4); - - if (copy_to_user - (argp, firmware, sizeof(FirmwareVer_type))) - return -EFAULT; - return 0; - } + return cciss_getfirmver(h, argp); case CCISS_GETDRIVVER: { DriverVer_type DriverVer = DRIVER_VERSION; -- cgit v0.10.2 From c525919ddf0b10ff0e1f528dd50d90edfdee4797 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:56:15 -0500 Subject: cciss: factor out cciss_getdrivver Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index c47e601..71018b9 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1371,8 +1371,19 @@ static int cciss_getfirmver(ctlr_info_t *h, void __user *argp) return 0; } +static int cciss_getdrivver(ctlr_info_t *h, void __user *argp) +{ + DriverVer_type DriverVer = DRIVER_VERSION; + + if (!argp) + return -EINVAL; + if (copy_to_user(argp, &DriverVer, sizeof(DriverVer_type))) + return -EFAULT; + return 0; +} + static int cciss_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) + unsigned int cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; ctlr_info_t *h = get_host(disk); @@ -1399,18 +1410,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_GETFIRMVER: return cciss_getfirmver(h, argp); case CCISS_GETDRIVVER: - { - DriverVer_type DriverVer = DRIVER_VERSION; - - if (!arg) - return -EINVAL; - - if (copy_to_user - (argp, &DriverVer, sizeof(DriverVer_type))) - return -EFAULT; - return 0; - } - + return cciss_getdrivver(h, argp); case CCISS_DEREGDISK: case CCISS_REGNEWD: case CCISS_REVALIDVOLS: -- cgit v0.10.2 From 0894b32c5c1444d25f3e988cf03415ce8dfc5142 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:56:20 -0500 Subject: cciss: factor out cciss_getluninfo Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 71018b9..5abdce3 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1382,12 +1382,27 @@ static int cciss_getdrivver(ctlr_info_t *h, void __user *argp) return 0; } +static int cciss_getluninfo(ctlr_info_t *h, + struct gendisk *disk, void __user *argp) +{ + LogvolInfo_struct luninfo; + drive_info_struct *drv = get_drv(disk); + + if (!argp) + return -EINVAL; + memcpy(&luninfo.LunID, drv->LunID, sizeof(luninfo.LunID)); + luninfo.num_opens = drv->usage_count; + luninfo.num_parts = 0; + if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct))) + return -EFAULT; + return 0; +} + static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; ctlr_info_t *h = get_host(disk); - drive_info_struct *drv = get_drv(disk); void __user *argp = (void __user *)arg; dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n", @@ -1415,19 +1430,8 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_REGNEWD: case CCISS_REVALIDVOLS: return rebuild_lun_table(h, 0, 1); - - case CCISS_GETLUNINFO:{ - LogvolInfo_struct luninfo; - - memcpy(&luninfo.LunID, drv->LunID, - sizeof(luninfo.LunID)); - luninfo.num_opens = drv->usage_count; - luninfo.num_parts = 0; - if (copy_to_user(argp, &luninfo, - sizeof(LogvolInfo_struct))) - return -EFAULT; - return 0; - } + case CCISS_GETLUNINFO: + return cciss_getluninfo(h, disk, argp); case CCISS_PASSTHRU: { IOCTL_Command_struct iocommand; -- cgit v0.10.2 From f32f125b1c14dcde49ec415ec941af750433251e Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:56:25 -0500 Subject: cciss: factor out cciss_passthru Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 5abdce3..076dbcf 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1398,6 +1398,106 @@ static int cciss_getluninfo(ctlr_info_t *h, return 0; } +static int cciss_passthru(ctlr_info_t *h, void __user *argp) +{ + IOCTL_Command_struct iocommand; + CommandList_struct *c; + char *buff = NULL; + u64bit temp64; + DECLARE_COMPLETION_ONSTACK(wait); + + if (!argp) + return -EINVAL; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (copy_from_user + (&iocommand, argp, sizeof(IOCTL_Command_struct))) + return -EFAULT; + if ((iocommand.buf_size < 1) && + (iocommand.Request.Type.Direction != XFER_NONE)) { + return -EINVAL; + } + if (iocommand.buf_size > 0) { + buff = kmalloc(iocommand.buf_size, GFP_KERNEL); + if (buff == NULL) + return -EFAULT; + } + if (iocommand.Request.Type.Direction == XFER_WRITE) { + /* Copy the data into the buffer we created */ + if (copy_from_user(buff, iocommand.buf, iocommand.buf_size)) { + kfree(buff); + return -EFAULT; + } + } else { + memset(buff, 0, iocommand.buf_size); + } + c = cmd_special_alloc(h); + if (!c) { + kfree(buff); + return -ENOMEM; + } + /* Fill in the command type */ + c->cmd_type = CMD_IOCTL_PEND; + /* Fill in Command Header */ + c->Header.ReplyQueue = 0; /* unused in simple mode */ + if (iocommand.buf_size > 0) { /* buffer to fill */ + c->Header.SGList = 1; + c->Header.SGTotal = 1; + } else { /* no buffers to fill */ + c->Header.SGList = 0; + c->Header.SGTotal = 0; + } + c->Header.LUN = iocommand.LUN_info; + /* use the kernel address the cmd block for tag */ + c->Header.Tag.lower = c->busaddr; + + /* Fill in Request block */ + c->Request = iocommand.Request; + + /* Fill in the scatter gather information */ + if (iocommand.buf_size > 0) { + temp64.val = pci_map_single(h->pdev, buff, + iocommand.buf_size, PCI_DMA_BIDIRECTIONAL); + c->SG[0].Addr.lower = temp64.val32.lower; + c->SG[0].Addr.upper = temp64.val32.upper; + c->SG[0].Len = iocommand.buf_size; + c->SG[0].Ext = 0; /* we are not chaining */ + } + c->waiting = &wait; + + enqueue_cmd_and_start_io(h, c); + wait_for_completion(&wait); + + /* unlock the buffers from DMA */ + temp64.val32.lower = c->SG[0].Addr.lower; + temp64.val32.upper = c->SG[0].Addr.upper; + pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, iocommand.buf_size, + PCI_DMA_BIDIRECTIONAL); + check_ioctl_unit_attention(h, c); + + /* Copy the error information out */ + iocommand.error_info = *(c->err_info); + if (copy_to_user(argp, &iocommand, sizeof(IOCTL_Command_struct))) { + kfree(buff); + cmd_special_free(h, c); + return -EFAULT; + } + + if (iocommand.Request.Type.Direction == XFER_READ) { + /* Copy the data out of the buffer we created */ + if (copy_to_user(iocommand.buf, buff, iocommand.buf_size)) { + kfree(buff); + cmd_special_free(h, c); + return -EFAULT; + } + } + kfree(buff); + cmd_special_free(h, c); + return 0; +} + static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1433,117 +1533,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_GETLUNINFO: return cciss_getluninfo(h, disk, argp); case CCISS_PASSTHRU: - { - IOCTL_Command_struct iocommand; - CommandList_struct *c; - char *buff = NULL; - u64bit temp64; - DECLARE_COMPLETION_ONSTACK(wait); - - if (!arg) - return -EINVAL; - - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - - if (copy_from_user - (&iocommand, argp, sizeof(IOCTL_Command_struct))) - return -EFAULT; - if 
((iocommand.buf_size < 1) && - (iocommand.Request.Type.Direction != XFER_NONE)) { - return -EINVAL; - } -#if 0 /* 'buf_size' member is 16-bits, and always smaller than kmalloc limit */ - /* Check kmalloc limits */ - if (iocommand.buf_size > 128000) - return -EINVAL; -#endif - if (iocommand.buf_size > 0) { - buff = kmalloc(iocommand.buf_size, GFP_KERNEL); - if (buff == NULL) - return -EFAULT; - } - if (iocommand.Request.Type.Direction == XFER_WRITE) { - /* Copy the data into the buffer we created */ - if (copy_from_user - (buff, iocommand.buf, iocommand.buf_size)) { - kfree(buff); - return -EFAULT; - } - } else { - memset(buff, 0, iocommand.buf_size); - } - c = cmd_special_alloc(h); - if (!c) { - kfree(buff); - return -ENOMEM; - } - /* Fill in the command type */ - c->cmd_type = CMD_IOCTL_PEND; - /* Fill in Command Header */ - c->Header.ReplyQueue = 0; /* unused in simple mode */ - if (iocommand.buf_size > 0) /* buffer to fill */ - { - c->Header.SGList = 1; - c->Header.SGTotal = 1; - } else /* no buffers to fill */ - { - c->Header.SGList = 0; - c->Header.SGTotal = 0; - } - c->Header.LUN = iocommand.LUN_info; - /* use the kernel address the cmd block for tag */ - c->Header.Tag.lower = c->busaddr; - - /* Fill in Request block */ - c->Request = iocommand.Request; - - /* Fill in the scatter gather information */ - if (iocommand.buf_size > 0) { - temp64.val = pci_map_single(h->pdev, buff, - iocommand.buf_size, - PCI_DMA_BIDIRECTIONAL); - c->SG[0].Addr.lower = temp64.val32.lower; - c->SG[0].Addr.upper = temp64.val32.upper; - c->SG[0].Len = iocommand.buf_size; - c->SG[0].Ext = 0; /* we are not chaining */ - } - c->waiting = &wait; - - enqueue_cmd_and_start_io(h, c); - wait_for_completion(&wait); - - /* unlock the buffers from DMA */ - temp64.val32.lower = c->SG[0].Addr.lower; - temp64.val32.upper = c->SG[0].Addr.upper; - pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, - iocommand.buf_size, - PCI_DMA_BIDIRECTIONAL); - - check_ioctl_unit_attention(h, c); - - /* Copy the error information out */ - iocommand.error_info = *(c->err_info); - if (copy_to_user - (argp, &iocommand, sizeof(IOCTL_Command_struct))) { - kfree(buff); - cmd_special_free(h, c); - return -EFAULT; - } - - if (iocommand.Request.Type.Direction == XFER_READ) { - /* Copy the data out of the buffer we created */ - if (copy_to_user - (iocommand.buf, buff, iocommand.buf_size)) { - kfree(buff); - cmd_special_free(h, c); - return -EFAULT; - } - } - kfree(buff); - cmd_special_free(h, c); - return 0; - } + return cciss_passthru(h, argp); case CCISS_BIG_PASSTHRU:{ BIG_IOCTL_Command_struct *ioc; CommandList_struct *c; -- cgit v0.10.2 From 0c9f5ba7cb7435ea4b99599de4af0729f0740647 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:56:30 -0500 Subject: cciss: factor out cciss_big_passthru Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 076dbcf..cff2fa1 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1498,6 +1498,155 @@ static int cciss_passthru(ctlr_info_t *h, void __user *argp) return 0; } +static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp) +{ + BIG_IOCTL_Command_struct *ioc; + CommandList_struct *c; + unsigned char **buff = NULL; + int *buff_size = NULL; + u64bit temp64; + BYTE sg_used = 0; + int status = 0; + int i; + DECLARE_COMPLETION_ONSTACK(wait); + __u32 left; + __u32 sz; + BYTE __user *data_ptr; + + if (!argp) + return -EINVAL; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + ioc = (BIG_IOCTL_Command_struct *) + kmalloc(sizeof(*ioc), GFP_KERNEL); + if (!ioc) { + status = -ENOMEM; + goto cleanup1; + } + if (copy_from_user(ioc, argp, sizeof(*ioc))) { + status = -EFAULT; + goto cleanup1; + } + if ((ioc->buf_size < 1) && + (ioc->Request.Type.Direction != XFER_NONE)) { + status = -EINVAL; + goto cleanup1; + } + /* Check kmalloc limits using all SGs */ + if (ioc->malloc_size > MAX_KMALLOC_SIZE) { + status = -EINVAL; + goto cleanup1; + } + if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) { + status = -EINVAL; + goto cleanup1; + } + buff = kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL); + if (!buff) { + status = -ENOMEM; + goto cleanup1; + } + buff_size = kmalloc(MAXSGENTRIES * sizeof(int), GFP_KERNEL); + if (!buff_size) { + status = -ENOMEM; + goto cleanup1; + } + left = ioc->buf_size; + data_ptr = ioc->buf; + while (left) { + sz = (left > ioc->malloc_size) ? ioc->malloc_size : left; + buff_size[sg_used] = sz; + buff[sg_used] = kmalloc(sz, GFP_KERNEL); + if (buff[sg_used] == NULL) { + status = -ENOMEM; + goto cleanup1; + } + if (ioc->Request.Type.Direction == XFER_WRITE) { + if (copy_from_user(buff[sg_used], data_ptr, sz)) { + status = -EFAULT; + goto cleanup1; + } + } else { + memset(buff[sg_used], 0, sz); + } + left -= sz; + data_ptr += sz; + sg_used++; + } + c = cmd_special_alloc(h); + if (!c) { + status = -ENOMEM; + goto cleanup1; + } + c->cmd_type = CMD_IOCTL_PEND; + c->Header.ReplyQueue = 0; + + if (ioc->buf_size > 0) { + c->Header.SGList = sg_used; + c->Header.SGTotal = sg_used; + } else { + c->Header.SGList = 0; + c->Header.SGTotal = 0; + } + c->Header.LUN = ioc->LUN_info; + c->Header.Tag.lower = c->busaddr; + + c->Request = ioc->Request; + if (ioc->buf_size > 0) { + for (i = 0; i < sg_used; i++) { + temp64.val = + pci_map_single(h->pdev, buff[i], buff_size[i], + PCI_DMA_BIDIRECTIONAL); + c->SG[i].Addr.lower = temp64.val32.lower; + c->SG[i].Addr.upper = temp64.val32.upper; + c->SG[i].Len = buff_size[i]; + c->SG[i].Ext = 0; /* we are not chaining */ + } + } + c->waiting = &wait; + enqueue_cmd_and_start_io(h, c); + wait_for_completion(&wait); + /* unlock the buffers from DMA */ + for (i = 0; i < sg_used; i++) { + temp64.val32.lower = c->SG[i].Addr.lower; + temp64.val32.upper = c->SG[i].Addr.upper; + pci_unmap_single(h->pdev, + (dma_addr_t) temp64.val, buff_size[i], + PCI_DMA_BIDIRECTIONAL); + } + check_ioctl_unit_attention(h, c); + /* Copy the error information out */ + ioc->error_info = *(c->err_info); + if (copy_to_user(argp, ioc, sizeof(*ioc))) { + cmd_special_free(h, c); + status = -EFAULT; + goto cleanup1; + } + if (ioc->Request.Type.Direction == XFER_READ) { + /* Copy the data out of the buffer we created */ + BYTE __user *ptr = ioc->buf; + for (i = 0; i < sg_used; i++) { + if (copy_to_user(ptr, buff[i], buff_size[i])) { + cmd_special_free(h, c); + status = -EFAULT; + 
goto cleanup1; + } + ptr += buff_size[i]; + } + } + cmd_special_free(h, c); + status = 0; +cleanup1: + if (buff) { + for (i = 0; i < sg_used; i++) + kfree(buff[i]); + kfree(buff); + } + kfree(buff_size); + kfree(ioc); + return status; +} + static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1534,162 +1683,8 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, return cciss_getluninfo(h, disk, argp); case CCISS_PASSTHRU: return cciss_passthru(h, argp); - case CCISS_BIG_PASSTHRU:{ - BIG_IOCTL_Command_struct *ioc; - CommandList_struct *c; - unsigned char **buff = NULL; - int *buff_size = NULL; - u64bit temp64; - BYTE sg_used = 0; - int status = 0; - int i; - DECLARE_COMPLETION_ONSTACK(wait); - __u32 left; - __u32 sz; - BYTE __user *data_ptr; - - if (!arg) - return -EINVAL; - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - ioc = (BIG_IOCTL_Command_struct *) - kmalloc(sizeof(*ioc), GFP_KERNEL); - if (!ioc) { - status = -ENOMEM; - goto cleanup1; - } - if (copy_from_user(ioc, argp, sizeof(*ioc))) { - status = -EFAULT; - goto cleanup1; - } - if ((ioc->buf_size < 1) && - (ioc->Request.Type.Direction != XFER_NONE)) { - status = -EINVAL; - goto cleanup1; - } - /* Check kmalloc limits using all SGs */ - if (ioc->malloc_size > MAX_KMALLOC_SIZE) { - status = -EINVAL; - goto cleanup1; - } - if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) { - status = -EINVAL; - goto cleanup1; - } - buff = - kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL); - if (!buff) { - status = -ENOMEM; - goto cleanup1; - } - buff_size = kmalloc(MAXSGENTRIES * sizeof(int), - GFP_KERNEL); - if (!buff_size) { - status = -ENOMEM; - goto cleanup1; - } - left = ioc->buf_size; - data_ptr = ioc->buf; - while (left) { - sz = (left > - ioc->malloc_size) ? 
ioc-> - malloc_size : left; - buff_size[sg_used] = sz; - buff[sg_used] = kmalloc(sz, GFP_KERNEL); - if (buff[sg_used] == NULL) { - status = -ENOMEM; - goto cleanup1; - } - if (ioc->Request.Type.Direction == XFER_WRITE) { - if (copy_from_user - (buff[sg_used], data_ptr, sz)) { - status = -EFAULT; - goto cleanup1; - } - } else { - memset(buff[sg_used], 0, sz); - } - left -= sz; - data_ptr += sz; - sg_used++; - } - c = cmd_special_alloc(h); - if (!c) { - status = -ENOMEM; - goto cleanup1; - } - c->cmd_type = CMD_IOCTL_PEND; - c->Header.ReplyQueue = 0; - - if (ioc->buf_size > 0) { - c->Header.SGList = sg_used; - c->Header.SGTotal = sg_used; - } else { - c->Header.SGList = 0; - c->Header.SGTotal = 0; - } - c->Header.LUN = ioc->LUN_info; - c->Header.Tag.lower = c->busaddr; - - c->Request = ioc->Request; - if (ioc->buf_size > 0) { - for (i = 0; i < sg_used; i++) { - temp64.val = - pci_map_single(h->pdev, buff[i], - buff_size[i], - PCI_DMA_BIDIRECTIONAL); - c->SG[i].Addr.lower = - temp64.val32.lower; - c->SG[i].Addr.upper = - temp64.val32.upper; - c->SG[i].Len = buff_size[i]; - c->SG[i].Ext = 0; /* we are not chaining */ - } - } - c->waiting = &wait; - enqueue_cmd_and_start_io(h, c); - wait_for_completion(&wait); - /* unlock the buffers from DMA */ - for (i = 0; i < sg_used; i++) { - temp64.val32.lower = c->SG[i].Addr.lower; - temp64.val32.upper = c->SG[i].Addr.upper; - pci_unmap_single(h->pdev, - (dma_addr_t) temp64.val, buff_size[i], - PCI_DMA_BIDIRECTIONAL); - } - check_ioctl_unit_attention(h, c); - /* Copy the error information out */ - ioc->error_info = *(c->err_info); - if (copy_to_user(argp, ioc, sizeof(*ioc))) { - cmd_special_free(h, c); - status = -EFAULT; - goto cleanup1; - } - if (ioc->Request.Type.Direction == XFER_READ) { - /* Copy the data out of the buffer we created */ - BYTE __user *ptr = ioc->buf; - for (i = 0; i < sg_used; i++) { - if (copy_to_user - (ptr, buff[i], buff_size[i])) { - cmd_special_free(h, c); - status = -EFAULT; - goto cleanup1; - } - ptr += buff_size[i]; - } - } - cmd_special_free(h, c); - status = 0; - cleanup1: - if (buff) { - for (i = 0; i < sg_used; i++) - kfree(buff[i]); - kfree(buff); - } - kfree(buff_size); - kfree(ioc); - return status; - } + case CCISS_BIG_PASSTHRU: + return cciss_bigpassthru(h, argp); /* scsi_cmd_ioctl handles these, below, though some are not */ /* very meaningful for cciss. SG_IO is the main one people want. */ -- cgit v0.10.2 From fcfb5c0ce11fe474cd6b4f7e83e0cc25f220d911 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 26 Aug 2010 13:56:35 -0500 Subject: cciss: remove some superfluous tests from cciss_bigpassthru() Signed-off-by: Stephen M. 
Cameron Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index cff2fa1..df2ed4d 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1580,28 +1580,19 @@ static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp) } c->cmd_type = CMD_IOCTL_PEND; c->Header.ReplyQueue = 0; - - if (ioc->buf_size > 0) { - c->Header.SGList = sg_used; - c->Header.SGTotal = sg_used; - } else { - c->Header.SGList = 0; - c->Header.SGTotal = 0; - } + c->Header.SGList = sg_used; + c->Header.SGTotal = sg_used; c->Header.LUN = ioc->LUN_info; c->Header.Tag.lower = c->busaddr; c->Request = ioc->Request; - if (ioc->buf_size > 0) { - for (i = 0; i < sg_used; i++) { - temp64.val = - pci_map_single(h->pdev, buff[i], buff_size[i], + for (i = 0; i < sg_used; i++) { + temp64.val = pci_map_single(h->pdev, buff[i], buff_size[i], PCI_DMA_BIDIRECTIONAL); - c->SG[i].Addr.lower = temp64.val32.lower; - c->SG[i].Addr.upper = temp64.val32.upper; - c->SG[i].Len = buff_size[i]; - c->SG[i].Ext = 0; /* we are not chaining */ - } + c->SG[i].Addr.lower = temp64.val32.lower; + c->SG[i].Addr.upper = temp64.val32.upper; + c->SG[i].Len = buff_size[i]; + c->SG[i].Ext = 0; /* we are not chaining */ } c->waiting = &wait; enqueue_cmd_and_start_io(h, c); -- cgit v0.10.2 From 488211844e0c3fad6ffa98a6f3c4f2139074e79a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 Sep 2010 09:32:36 +0200 Subject: floppy: switch to one queue per drive instead of sharing a queue Pretty straight forward conversion. Note that we do round-robin between the drives that have available requests, before we simply used the drive that the IO scheduler told us to. Since the IO scheduler doesn't care about multiple devices per queue, the resulting sort would not have made sense. Fixed by Vivek to get rid of a double lock problem in set_next_request() Signed-off-by: Jens Axboe Signed-off-by: Vivek Goyal diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index cf04c1b..aa42e77 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -258,8 +258,8 @@ static int irqdma_allocated; #include static struct request *current_req; -static struct request_queue *floppy_queue; static void do_fd_request(struct request_queue *q); +static int set_next_request(void); #ifndef fd_get_dma_residue #define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA) @@ -413,6 +413,7 @@ static struct gendisk *disks[N_DRIVE]; static struct block_device *opened_bdev[N_DRIVE]; static DEFINE_MUTEX(open_lock); static struct floppy_raw_cmd *raw_cmd, default_raw_cmd; +static int fdc_queue; /* * This struct defines the different floppy types. 
@@ -890,8 +891,8 @@ static void unlock_fdc(void) del_timer(&fd_timeout); cont = NULL; clear_bit(0, &fdc_busy); - if (current_req || blk_peek_request(floppy_queue)) - do_fd_request(floppy_queue); + if (current_req || set_next_request()) + do_fd_request(current_req->q); spin_unlock_irqrestore(&floppy_lock, flags); wake_up(&fdc_wait); } @@ -2243,8 +2244,8 @@ static void floppy_end_request(struct request *req, int error) * logical buffer */ static void request_done(int uptodate) { - struct request_queue *q = floppy_queue; struct request *req = current_req; + struct request_queue *q; unsigned long flags; int block; char msg[sizeof("request done ") + sizeof(int) * 3]; @@ -2258,6 +2259,8 @@ static void request_done(int uptodate) return; } + q = req->q; + if (uptodate) { /* maintain values for invalidation on geometry * change */ @@ -2811,6 +2814,28 @@ static int make_raw_rw_request(void) return 2; } +/* + * Round-robin between our available drives, doing one request from each + */ +static int set_next_request(void) +{ + struct request_queue *q; + int old_pos = fdc_queue; + + do { + q = disks[fdc_queue]->queue; + if (++fdc_queue == N_DRIVE) + fdc_queue = 0; + if (q) { + current_req = blk_fetch_request(q); + if (current_req) + break; + } + } while (fdc_queue != old_pos); + + return current_req != NULL; +} + static void redo_fd_request(void) { int drive; @@ -2822,17 +2847,17 @@ static void redo_fd_request(void) do_request: if (!current_req) { - struct request *req; + int pending; + + spin_lock_irq(&floppy_lock); + pending = set_next_request(); + spin_unlock_irq(&floppy_lock); - spin_lock_irq(floppy_queue->queue_lock); - req = blk_fetch_request(floppy_queue); - spin_unlock_irq(floppy_queue->queue_lock); - if (!req) { + if (!pending) { do_floppy = NULL; unlock_fdc(); return; } - current_req = req; } drive = (long)current_req->rq_disk->private_data; set_fdc(drive); @@ -4165,6 +4190,13 @@ static int __init floppy_init(void) goto out_put_disk; } + disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock); + if (!disks[dr]->queue) { + err = -ENOMEM; + goto out_put_disk; + } + + blk_queue_max_hw_sectors(disks[dr]->queue, 64); disks[dr]->major = FLOPPY_MAJOR; disks[dr]->first_minor = TOMINOR(dr); disks[dr]->fops = &floppy_fops; @@ -4183,13 +4215,6 @@ static int __init floppy_init(void) if (err) goto out_unreg_blkdev; - floppy_queue = blk_init_queue(do_fd_request, &floppy_lock); - if (!floppy_queue) { - err = -ENOMEM; - goto out_unreg_driver; - } - blk_queue_max_hw_sectors(floppy_queue, 64); - blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, floppy_find, NULL, NULL); @@ -4317,7 +4342,6 @@ static int __init floppy_init(void) /* to be cleaned up... 
*/ disks[drive]->private_data = (void *)(long)drive; - disks[drive]->queue = floppy_queue; disks[drive]->flags |= GENHD_FL_REMOVABLE; disks[drive]->driverfs_dev = &floppy_device[drive].dev; add_disk(disks[drive]); @@ -4333,8 +4357,6 @@ out_flush_work: floppy_release_irq_and_dma(); out_unreg_region: blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); - blk_cleanup_queue(floppy_queue); -out_unreg_driver: platform_driver_unregister(&floppy_driver); out_unreg_blkdev: unregister_blkdev(FLOPPY_MAJOR, "fd"); @@ -4342,6 +4364,8 @@ out_put_disk: while (dr--) { del_timer(&motor_off_timer[dr]); put_disk(disks[dr]); + if (disks[dr]->queue) + blk_cleanup_queue(disks[dr]->queue); } return err; } @@ -4550,11 +4574,11 @@ static void __exit floppy_module_exit(void) platform_device_unregister(&floppy_device[drive]); } put_disk(disks[drive]); + blk_cleanup_queue(disks[drive]->queue); } del_timer_sync(&fd_timeout); del_timer_sync(&fd_timer); - blk_cleanup_queue(floppy_queue); if (atomic_read(&usage_count)) floppy_release_irq_and_dma(); -- cgit v0.10.2 From 786029ff810ff4a2fd52c0462713985a415417ab Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 24 Sep 2010 20:35:44 +0200 Subject: amiga floppy: Stop sharing request queue across multiple gendisks o Use one request queue per gendisk instead of sharing request queue o Don't have hardware. No compile testing or run time testing done. Completely untested. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 76f114f..ead8b77 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -114,8 +114,6 @@ static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it does module_param(fd_def_df0, ulong, 0); MODULE_LICENSE("GPL"); -static struct request_queue *floppy_queue; - /* * Macros */ @@ -164,6 +162,7 @@ static volatile int selected = -1; /* currently selected drive */ static int writepending; static int writefromint; static char *raw_buf; +static int fdc_queue; static DEFINE_SPINLOCK(amiflop_lock); @@ -1334,6 +1333,42 @@ static int get_track(int drive, int track) return -1; } +/* + * Round-robin between our available drives, doing one request from each + */ +static struct request *set_next_request(void) +{ + struct request_queue *q; + int cnt = FD_MAX_UNITS; + struct request *rq; + + /* Find next queue we can dispatch from */ + fdc_queue = fdc_queue + 1; + if (fdc_queue == FD_MAX_UNITS) + fdc_queue = 0; + + for(cnt = FD_MAX_UNITS; cnt > 0, cnt--) { + + if (unit[fdc_queue].type->code == FD_NODRIVE) { + if (++fdc_queue == FD_MAX_UNITS) + fdc_queue = 0; + cotinue; + } + + q = unit[fdc_queue].gendisk->queue; + if (q) { + rq = blk_fetch_request(q); + if (rq) + break; + } + + if (++fdc_queue == FD_MAX_UNITS) + fdc_queue = 0; + } + + return rq; +} + static void redo_fd_request(void) { struct request *rq; @@ -1345,7 +1380,7 @@ static void redo_fd_request(void) int err; next_req: - rq = blk_fetch_request(floppy_queue); + rq = set_next_request(); if (!rq) { /* Nothing left to do */ return; @@ -1682,6 +1717,13 @@ static int __init fd_probe_drives(void) continue; } unit[drive].gendisk = disk; + + disk->queue = blk_init_queue(do_fd_request, &amiflop_lock); + if (!disk->queue) { + unit[drive].type->code = FD_NODRIVE; + continue; + } + drives++; if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) { printk("no mem for "); @@ -1695,7 +1737,6 @@ static int __init fd_probe_drives(void) disk->fops = &floppy_fops; sprintf(disk->disk_name, "fd%d", drive); 
disk->private_data = &unit[drive]; - disk->queue = floppy_queue; set_capacity(disk, 880*2); add_disk(disk); } @@ -1743,11 +1784,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev) goto out_irq2; } - ret = -ENOMEM; - floppy_queue = blk_init_queue(do_fd_request, &amiflop_lock); - if (!floppy_queue) - goto out_queue; - ret = -ENODEV; if (fd_probe_drives() < 1) /* No usable drives */ goto out_probe; @@ -1791,7 +1827,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev) return 0; out_probe: - blk_cleanup_queue(floppy_queue); out_queue: free_irq(IRQ_AMIGA_CIAA_TB, NULL); out_irq2: @@ -1810,9 +1845,12 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev) for( i = 0; i < FD_MAX_UNITS; i++) { if (unit[i].type->code != FD_NODRIVE) { + struct request_queue *q = unit[i].gendisk->queue; del_gendisk(unit[i].gendisk); put_disk(unit[i].gendisk); kfree(unit[i].trackbuf); + if (q) + blk_cleanup_queue(q); } } blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); @@ -1820,7 +1858,6 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev) free_irq(IRQ_AMIGA_DSKBLK, NULL); custom.dmacon = DMAF_DISK; /* disable DMA */ amiga_chip_free(raw_buf); - blk_cleanup_queue(floppy_queue); unregister_blkdev(FLOPPY_MAJOR, "fd"); } #endif -- cgit v0.10.2 From 639e2f2aa76eefaf22078dccbbf2f3483f587aa7 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 24 Sep 2010 20:35:45 +0200 Subject: atari floppy: Stop sharing request queue across multiple gendisks o Use one request queue per gendisk instead of sharing the queue. o Don't have hardware. No compile testing or run time testing done. Completely untested. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index aceb964..0f4eec4 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -79,8 +79,8 @@ #undef DEBUG -static struct request_queue *floppy_queue; static struct request *fd_request; +static int fdc_queue; /* Disk types: DD, HD, ED */ static struct atari_disk_type { @@ -1391,6 +1391,29 @@ static void setup_req_params( int drive ) ReqTrack, ReqSector, (unsigned long)ReqData )); } +/* + * Round-robin between our available drives, doing one request from each + */ +static struct request *set_next_request(void) +{ + struct request_queue *q; + int old_pos = fdc_queue; + struct request *rq; + + do { + q = unit[fdc_queue].disk->queue; + if (++fdc_queue == FD_MAX_UNITS) + fdc_queue = 0; + if (q) { + rq = blk_fetch_request(q); + if (rq) + break; + } + } while (fdc_queue != old_pos); + + return rq; +} + static void redo_fd_request(void) { @@ -1405,7 +1428,7 @@ static void redo_fd_request(void) repeat: if (!fd_request) { - fd_request = blk_fetch_request(floppy_queue); + fd_request = set_next_request(); if (!fd_request) goto the_end; } @@ -1932,10 +1955,6 @@ static int __init atari_floppy_init (void) PhysTrackBuffer = virt_to_phys(TrackBuffer); BufferDrive = BufferSide = BufferTrack = -1; - floppy_queue = blk_init_queue(do_fd_request, &ataflop_lock); - if (!floppy_queue) - goto Enomem; - for (i = 0; i < FD_MAX_UNITS; i++) { unit[i].track = -1; unit[i].flags = 0; @@ -1944,7 +1963,10 @@ static int __init atari_floppy_init (void) sprintf(unit[i].disk->disk_name, "fd%d", i); unit[i].disk->fops = &floppy_fops; unit[i].disk->private_data = &unit[i]; - unit[i].disk->queue = floppy_queue; + unit[i].disk->queue = blk_init_queue(do_fd_request, + &ataflop_lock); + if (!unit[i].disk->queue) + goto Enomem; set_capacity(unit[i].disk, MAX_DISK_SIZE * 2); 
add_disk(unit[i].disk); } @@ -1959,10 +1981,14 @@ static int __init atari_floppy_init (void) return 0; Enomem: - while (i--) + while (i--) { + struct request_queue *q = unit[i].disk->queue; + put_disk(unit[i].disk); - if (floppy_queue) - blk_cleanup_queue(floppy_queue); + if (q) + blk_cleanup_queue(q); + } + unregister_blkdev(FLOPPY_MAJOR, "fd"); return -ENOMEM; } @@ -2011,12 +2037,14 @@ static void __exit atari_floppy_exit(void) int i; blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); for (i = 0; i < FD_MAX_UNITS; i++) { + struct request_queue *q = unit[i].disk->queue; + del_gendisk(unit[i].disk); put_disk(unit[i].disk); + blk_cleanup_queue(q); } unregister_blkdev(FLOPPY_MAJOR, "fd"); - blk_cleanup_queue(floppy_queue); del_timer_sync(&fd_timer); atari_stram_free( DMABuffer ); } -- cgit v0.10.2 From 504c6d1b44bc6e694bdba8d9a2a4e046275b5e2b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Sun, 26 Sep 2010 12:23:25 +0900 Subject: amiga floppy: Compile failure fixes o Compile fixes for amiga floppy driver. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index ead8b77..327ed27 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1347,12 +1347,12 @@ static struct request *set_next_request(void) if (fdc_queue == FD_MAX_UNITS) fdc_queue = 0; - for(cnt = FD_MAX_UNITS; cnt > 0, cnt--) { + for(cnt = FD_MAX_UNITS; cnt > 0; cnt--) { if (unit[fdc_queue].type->code == FD_NODRIVE) { if (++fdc_queue == FD_MAX_UNITS) fdc_queue = 0; - cotinue; + continue; } q = unit[fdc_queue].gendisk->queue; @@ -1827,7 +1827,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev) return 0; out_probe: -out_queue: free_irq(IRQ_AMIGA_CIAA_TB, NULL); out_irq2: free_irq(IRQ_AMIGA_DSKBLK, NULL); -- cgit v0.10.2 From 7e602c0aaf3e686c36cc742119f0f53f42e9befe Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 27 May 2010 14:49:27 +0200 Subject: drbd: renamed drbd_tl_epoch.n_req to drbd_tl_epoch.n_writes Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 352441b..11b7c6f 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -697,7 +697,7 @@ struct drbd_tl_epoch { struct list_head requests; /* requests before */ struct drbd_tl_epoch *next; /* pointer to the next barrier */ unsigned int br_number; /* the barriers identifier. 
*/ - int n_req; /* number of requests attached before this barrier */ + int n_writes; /* number of requests attached before this barrier */ }; struct drbd_request; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index fa650dd..a9bc6bc 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -199,7 +199,7 @@ static int tl_init(struct drbd_conf *mdev) INIT_LIST_HEAD(&b->w.list); b->next = NULL; b->br_number = 4711; - b->n_req = 0; + b->n_writes = 0; b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ mdev->oldest_tle = b; @@ -240,7 +240,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) INIT_LIST_HEAD(&new->w.list); new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ new->next = NULL; - new->n_req = 0; + new->n_writes = 0; newest_before = mdev->newest_tle; /* never send a barrier number == 0, because that is special-cased @@ -284,9 +284,9 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, barrier_nr, b->br_number); goto bail; } - if (b->n_req != set_size) { - dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n", - barrier_nr, set_size, b->n_req); + if (b->n_writes != set_size) { + dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", + barrier_nr, set_size, b->n_writes); goto bail; } @@ -378,7 +378,7 @@ void tl_clear(struct drbd_conf *mdev) INIT_LIST_HEAD(&b->w.list); b->w.cb = NULL; b->br_number = new_initial_bnr; - b->n_req = 0; + b->n_writes = 0; mdev->oldest_tle = b; break; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index f761d98..976d7941 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -521,7 +521,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, &mdev->newest_tle->requests); /* increment size of current epoch */ - mdev->newest_tle->n_req++; + mdev->newest_tle->n_writes++; /* queue work item to send data */ D_ASSERT(req->rq_state & RQ_NET_PENDING); @@ -530,7 +530,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, drbd_queue_work(&mdev->data.work, &req->w); /* close the epoch, in case it outgrew the limit */ - if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size) + if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size) queue_barrier(mdev); break; -- cgit v0.10.2 From 288f422ec13667de40b278535d2a5fb5c77352c4 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 27 May 2010 15:07:43 +0200 Subject: drbd: Track all IO requests on the TL, not writes only With that the drbd_fail_pending_reads() function becomes obsolete. 
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a9bc6bc..a86e6f1 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -401,6 +401,8 @@ void tl_clear(struct drbd_conf *mdev) /* ensure bit indicating barrier is required is clear */ clear_bit(CREATE_BARRIER, &mdev->flags); + memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); + spin_unlock_irq(&mdev->req_lock); } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 081522d..88a5e1f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3666,41 +3666,6 @@ static void drbdd(struct drbd_conf *mdev) } } -static void drbd_fail_pending_reads(struct drbd_conf *mdev) -{ - struct hlist_head *slot; - struct hlist_node *pos; - struct hlist_node *tmp; - struct drbd_request *req; - int i; - - /* - * Application READ requests - */ - spin_lock_irq(&mdev->req_lock); - for (i = 0; i < APP_R_HSIZE; i++) { - slot = mdev->app_reads_hash+i; - hlist_for_each_entry_safe(req, pos, tmp, slot, colision) { - /* it may (but should not any longer!) - * be on the work queue; if that assert triggers, - * we need to also grab the - * spin_lock_irq(&mdev->data.work.q_lock); - * and list_del_init here. */ - D_ASSERT(list_empty(&req->w.list)); - /* It would be nice to complete outside of spinlock. - * But this is easier for now. */ - _req_mod(req, connection_lost_while_pending); - } - } - for (i = 0; i < APP_R_HSIZE; i++) - if (!hlist_empty(mdev->app_reads_hash+i)) - dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: " - "%p, should be NULL\n", i, mdev->app_reads_hash[i].first); - - memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); - spin_unlock_irq(&mdev->req_lock); -} - void drbd_flush_workqueue(struct drbd_conf *mdev) { struct drbd_wq_barrier barr; @@ -3770,8 +3735,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) if (!mdev->state.susp) tl_clear(mdev); - drbd_fail_pending_reads(mdev); - dev_info(DEV, "Connection closed\n"); drbd_md_sync(mdev); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 976d7941..4a30e2c 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -59,17 +59,19 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) { const unsigned long s = req->rq_state; + + /* remove it from the transfer log. + * well, only if it had been there in the first + * place... if it had not (local only or conflicting + * and never sent), it should still be "empty" as + * initialized in drbd_req_new(), so we can list_del() it + * here unconditionally */ + list_del(&req->tl_requests); + /* if it was a write, we may have to set the corresponding * bit(s) out-of-sync first. If it had a local part, we need to * release the reference to the activity log. */ if (rw == WRITE) { - /* remove it from the transfer log. - * well, only if it had been there in the first - * place... if it had not (local only or conflicting - * and never sent), it should still be "empty" as - * initialized in drbd_req_new(), so we can list_del() it - * here unconditionally */ - list_del(&req->tl_requests); /* Set out-of-sync unless both OK flags are set * (local only or remote failed). 
* Other places where we set out-of-sync: @@ -517,8 +519,6 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); req->epoch = mdev->newest_tle->br_number; - list_add_tail(&req->tl_requests, - &mdev->newest_tle->requests); /* increment size of current epoch */ mdev->newest_tle->n_writes++; @@ -634,6 +634,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case barrier_acked: + if (!(req->rq_state & RQ_WRITE)) + break; + if (req->rq_state & RQ_NET_PENDING) { /* barrier came in before all requests have been acked. * this is bad, because if the connection is lost now, @@ -892,6 +895,9 @@ allocate_barrier: remote = 0; } + + list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); + /* NOTE remote first: to get the concurrent write detection right, * we must register the request before start of local IO. */ if (remote) { diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 02d575d..47b931f 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -183,6 +183,9 @@ enum drbd_req_state_bits { /* keep this last, its for the RQ_NET_MASK */ __RQ_NET_MAX, + + /* Set when this is a write, clear for a read */ + __RQ_WRITE, }; #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) @@ -201,6 +204,8 @@ enum drbd_req_state_bits { /* 0x1f8 */ #define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) +#define RQ_WRITE (1UL << __RQ_WRITE) + /* epoch entries */ static inline struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) @@ -253,7 +258,7 @@ static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, if (likely(req)) { bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ - req->rq_state = 0; + req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; req->mdev = mdev; req->master_bio = bio_src; req->private_bio = bio; -- cgit v0.10.2 From 2a80699f807885d501f08a7006f6a56c1c937a6e Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 9 Jun 2010 14:07:43 +0200 Subject: drbd: mod_req has now a return value Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 4a30e2c..d9df1a1 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -382,10 +382,11 @@ out_conflict: * and it enforces that we have to think in a very structured manner * about the "events" that may happen to a request during its life time ... 
*/ -void __req_mod(struct drbd_request *req, enum drbd_req_event what, +int __req_mod(struct drbd_request *req, enum drbd_req_event what, struct bio_and_error *m) { struct drbd_conf *mdev = req->mdev; + int rv = 0; m->bio = NULL; switch (what) { @@ -657,6 +658,8 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, _req_may_be_done(req, m); break; }; + + return rv; } /* we may do a local read if: diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 47b931f..db37c6e 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -297,36 +297,43 @@ struct bio_and_error { extern void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m); -extern void __req_mod(struct drbd_request *req, enum drbd_req_event what, +extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, struct bio_and_error *m); extern void complete_master_bio(struct drbd_conf *mdev, struct bio_and_error *m); /* use this if you don't want to deal with calling complete_master_bio() * outside the spinlock, e.g. when walking some list on cleanup. */ -static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what) +static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) { struct drbd_conf *mdev = req->mdev; struct bio_and_error m; + int rv; /* __req_mod possibly frees req, do not touch req after that! */ - __req_mod(req, what, &m); + rv = __req_mod(req, what, &m); if (m.bio) complete_master_bio(mdev, &m); + + return rv; } /* completion of master bio is outside of spinlock. * If you need it irqsave, do it your self! */ -static inline void req_mod(struct drbd_request *req, +static inline int req_mod(struct drbd_request *req, enum drbd_req_event what) { struct drbd_conf *mdev = req->mdev; struct bio_and_error m; + int rv; + spin_lock_irq(&mdev->req_lock); - __req_mod(req, what, &m); + rv = __req_mod(req, what, &m); spin_unlock_irq(&mdev->req_lock); if (m.bio) complete_master_bio(mdev, &m); + + return rv; } #endif -- cgit v0.10.2 From 11b58e73a3a3d1bbb582370d59f9b2c4d0136b42 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 12 May 2010 17:08:26 +0200 Subject: drbd: factored tl_restart() out of tl_clear(). If IO was frozen for a temporal network outage, resend the content of the transfer-log into the newly established connection. 
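For readers unfamiliar with the transfer log, the following standalone sketch (not part of the patch; tl_entry, tl_resend and resend_entry are illustrative names, not DRBD symbols) shows the general idea behind the resend: after the connection is re-established, walk the log and re-queue every write the peer never acknowledged, while entries that only miss the barrier ack can be completed locally.

#include <stdio.h>
#include <stddef.h>

/* Minimal userspace model of "resend the transfer log"; names are made up. */
struct tl_entry {
	unsigned long    rq_state;   /* RQ_NET_OK set once the peer acked the data */
	struct tl_entry *next;
};

#define RQ_NET_OK (1UL << 0)

static void resend_entry(struct tl_entry *e)
{
	/* in the driver this would re-queue the request's worker callback */
	printf("re-sending request %p\n", (void *)e);
}

/* Walk the log after reconnect: anything not acked by the peer goes out again. */
static void tl_resend(struct tl_entry *log)
{
	for (struct tl_entry *e = log; e; e = e->next) {
		if (!(e->rq_state & RQ_NET_OK))
			resend_entry(e);
	}
}

int main(void)
{
	struct tl_entry b = { RQ_NET_OK, NULL };
	struct tl_entry a = { 0, &b };

	tl_resend(&a);   /* only "a" is re-sent; "b" was already acknowledged */
	return 0;
}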
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 11b7c6f..bef9138 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1138,6 +1138,8 @@ extern void drbd_free_resources(struct drbd_conf *mdev); extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, unsigned int set_size); extern void tl_clear(struct drbd_conf *mdev); +enum drbd_req_event; +extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what); extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); extern void drbd_free_sock(struct drbd_conf *mdev); extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a86e6f1..a8a0341 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -333,59 +333,94 @@ bail: drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); } - /** - * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL + * _tl_restart() - Walks the transfer log, and applies an action to all requests * @mdev: DRBD device. + * @what: The action/event to perform with all request objects * - * This is called after the connection to the peer was lost. The storage covered - * by the requests on the transfer gets marked as our of sync. Called from the - * receiver thread and the worker thread. + * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io, + * restart_frozen_disk_io. */ -void tl_clear(struct drbd_conf *mdev) +static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) { - struct drbd_tl_epoch *b, *tmp; + struct drbd_tl_epoch *b, *tmp, **pn; struct list_head *le, *tle; - struct drbd_request *r; - int new_initial_bnr = net_random(); - - spin_lock_irq(&mdev->req_lock); + struct drbd_request *req; + int rv, n_writes, n_reads; b = mdev->oldest_tle; + pn = &mdev->oldest_tle; while (b) { + n_writes = 0; + n_reads = 0; list_for_each_safe(le, tle, &b->requests) { - r = list_entry(le, struct drbd_request, tl_requests); - /* It would be nice to complete outside of spinlock. - * But this is easier for now. */ - _req_mod(r, connection_lost_while_pending); + req = list_entry(le, struct drbd_request, tl_requests); + rv = _req_mod(req, what); + + n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT; + n_reads += (rv & MR_READ) >> MR_READ_SHIFT; } tmp = b->next; - /* there could still be requests on that ring list, - * in case local io is still pending */ - list_del(&b->requests); - - /* dec_ap_pending corresponding to queue_barrier. - * the newest barrier may not have been queued yet, - * in which case w.cb is still NULL. */ - if (b->w.cb != NULL) - dec_ap_pending(mdev); - - if (b == mdev->newest_tle) { - /* recycle, but reinit! */ - D_ASSERT(tmp == NULL); - INIT_LIST_HEAD(&b->requests); - INIT_LIST_HEAD(&b->w.list); - b->w.cb = NULL; - b->br_number = new_initial_bnr; - b->n_writes = 0; - - mdev->oldest_tle = b; - break; + if (n_writes + n_reads) { + if (what == resend) { + b->n_writes = n_writes; + if (b->w.cb == NULL) { + b->w.cb = w_send_barrier; + inc_ap_pending(mdev); + set_bit(CREATE_BARRIER, &mdev->flags); + } + + drbd_queue_work(&mdev->data.work, &b->w); + } + pn = &b->next; + } else { + /* there could still be requests on that ring list, + * in case local io is still pending */ + list_del(&b->requests); + + /* dec_ap_pending corresponding to queue_barrier. 
+ * the newest barrier may not have been queued yet, + * in which case w.cb is still NULL. */ + if (b->w.cb != NULL) + dec_ap_pending(mdev); + + if (b == mdev->newest_tle) { + /* recycle, but reinit! */ + D_ASSERT(tmp == NULL); + INIT_LIST_HEAD(&b->requests); + INIT_LIST_HEAD(&b->w.list); + b->w.cb = NULL; + b->br_number = net_random(); + b->n_writes = 0; + + *pn = b; + break; + } + *pn = tmp; + kfree(b); } - kfree(b); b = tmp; } +} + + +/** + * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL + * @mdev: DRBD device. + * + * This is called after the connection to the peer was lost. The storage covered + * by the requests on the transfer gets marked as our of sync. Called from the + * receiver thread and the worker thread. + */ +void tl_clear(struct drbd_conf *mdev) +{ + struct list_head *le, *tle; + struct drbd_request *r; + + spin_lock_irq(&mdev->req_lock); + + _tl_restart(mdev, connection_lost_while_pending); /* we expect this list to be empty. */ D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); @@ -406,6 +441,13 @@ void tl_clear(struct drbd_conf *mdev) spin_unlock_irq(&mdev->req_lock); } +void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) +{ + spin_lock_irq(&mdev->req_lock); + _tl_restart(mdev, what); + spin_unlock_irq(&mdev->req_lock); +} + /** * cl_wide_st_chg() - TRUE if the state change is a cluster wide one * @mdev: DRBD device. diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 88a5e1f..8daa920 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -776,9 +776,6 @@ static int drbd_connect(struct drbd_conf *mdev) D_ASSERT(!mdev->data.socket); - if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) - dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n"); - if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) return -2; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index d9df1a1..39c2cc3 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -634,6 +634,20 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, /* else: done by handed_over_to_network */ break; + case resend: + /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK + before the connection loss; only P_BARRIER_ACK was missing. + Trowing them out of the TL here by pretending we got a BARRIER_ACK + TODO: Either resync them, or ensure peer was not rebooted. */ + if (!(req->rq_state & RQ_NET_OK)) { + if (req->w.cb) { + drbd_queue_work(&mdev->data.work, &req->w); + rv = req->rq_state & RQ_WRITE ? 
MR_WRITE : MR_READ; + } + break; + } + /* else, fall through to barrier_acked */ + case barrier_acked: if (!(req->rq_state & RQ_WRITE)) break; diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index db37c6e..1bcb855 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -104,6 +104,7 @@ enum drbd_req_event { read_ahead_completed_with_error, write_completed_with_error, completed_ok, + resend, nothing, /* for tracing only */ }; @@ -206,6 +207,13 @@ enum drbd_req_state_bits { #define RQ_WRITE (1UL << __RQ_WRITE) +/* For waking up the frozen transfer log mod_req() has to return if the request + should be counted in the epoch object*/ +#define MR_WRITE_SHIFT 0 +#define MR_WRITE (1 << MR_WRITE_SHIFT) +#define MR_READ_SHIFT 1 +#define MR_READ (1 << MR_READ_SHIFT) + /* epoch entries */ static inline struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) -- cgit v0.10.2 From b9b98716f83856b928f1c985ab55520c67663dd2 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 22 Jun 2010 11:26:48 +0200 Subject: drbd: Do not send two barriers without any writes between them Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a8a0341..7d35986 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -344,7 +344,7 @@ bail: static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) { struct drbd_tl_epoch *b, *tmp, **pn; - struct list_head *le, *tle; + struct list_head *le, *tle, carry_reads; struct drbd_request *req; int rv, n_writes, n_reads; @@ -353,6 +353,7 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) while (b) { n_writes = 0; n_reads = 0; + INIT_LIST_HEAD(&carry_reads); list_for_each_safe(le, tle, &b->requests) { req = list_entry(le, struct drbd_request, tl_requests); rv = _req_mod(req, what); @@ -362,7 +363,7 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) } tmp = b->next; - if (n_writes + n_reads) { + if (n_writes) { if (what == resend) { b->n_writes = n_writes; if (b->w.cb == NULL) { @@ -375,6 +376,8 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) } pn = &b->next; } else { + if (n_reads) + list_add(&carry_reads, &b->requests); /* there could still be requests on that ring list, * in case local io is still pending */ list_del(&b->requests); @@ -389,6 +392,7 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) /* recycle, but reinit! 
*/ D_ASSERT(tmp == NULL); INIT_LIST_HEAD(&b->requests); + list_splice(&carry_reads, &b->requests); INIT_LIST_HEAD(&b->w.list); b->w.cb = NULL; b->br_number = net_random(); @@ -401,6 +405,7 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) kfree(b); } b = tmp; + list_splice(&carry_reads, &b->requests); } } -- cgit v0.10.2 From 5ba82308ea766b33404cb130a88fe4113d9c20a3 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 10 Jun 2010 13:30:36 +0200 Subject: drbd: factored drbd_req_make_private_bio() out of drbd_req_new() Preparing tl_thaw_dio() Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 1bcb855..07cb3b1 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -257,30 +257,36 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, return NULL; } +static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) +{ + struct bio *bio; + bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ + + req->private_bio = bio; + + bio->bi_private = req; + bio->bi_end_io = drbd_endio_pri; + bio->bi_next = NULL; +} + static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, struct bio *bio_src) { - struct bio *bio; struct drbd_request *req = mempool_alloc(drbd_request_mempool, GFP_NOIO); if (likely(req)) { - bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ + drbd_req_make_private_bio(req, bio_src); req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; req->mdev = mdev; req->master_bio = bio_src; - req->private_bio = bio; req->epoch = 0; - req->sector = bio->bi_sector; - req->size = bio->bi_size; + req->sector = bio_src->bi_sector; + req->size = bio_src->bi_size; req->start_time = jiffies; INIT_HLIST_NODE(&req->colision); INIT_LIST_HEAD(&req->tl_requests); INIT_LIST_HEAD(&req->w.list); - - bio->bi_private = req; - bio->bi_end_io = drbd_endio_pri; - bio->bi_next = NULL; } return req; } -- cgit v0.10.2 From 905cd7d8ac9b18e1f122b90dbebe1246b1c364fd Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 10 May 2010 16:03:10 +0200 Subject: drbd: Removed redundant error checks in the request code path Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 39c2cc3..4864758 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -984,21 +984,6 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) return 1; } - /* - * Paranoia: we might have been primary, but sync target, or - * even diskless, then lost the connection. - * This should have been handled (panic? suspend?) somewhere - * else. But maybe it was not, so check again here. - * Caution: as long as we do not have a read/write lock on mdev, - * to serialize state changes, this is racy, since we may lose - * the connection *after* we test for the cstate. 
- */ - if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) { - if (__ratelimit(&drbd_ratelimit_state)) - dev_err(DEV, "Sorry, I have no access to good data anymore.\n"); - return 1; - } - return 0; } -- cgit v0.10.2 From 265be2d09853d425ad14a61cda0ca63345613d0c Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 31 May 2010 10:14:17 +0200 Subject: drbd: Finished the "on-no-data-accessible suspend-io;" functionality When no data is accessible (no connection to the peer, nor a local disk) allow the user to select to freeze all IO operations instead of getting IO errors. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index bef9138..03cc975 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1469,6 +1469,7 @@ extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); +extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 7d35986..106b9ab 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -925,7 +925,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state if (fp == FP_STONITH && (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) - ns.susp = 1; + ns.susp = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ + + if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && + !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) + ns.susp = 1; /* Suspend IO while no data available (no accessible data available) */ if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { if (ns.conn == C_SYNC_SOURCE) @@ -1236,6 +1241,25 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Here we have the actions that are performed after a state change. 
This function might sleep */ + if (os.susp && ns.susp && mdev->sync_conf.on_no_data == OND_SUSPEND_IO) { + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { + if (ns.conn == C_CONNECTED) { + spin_lock_irq(&mdev->req_lock); + _tl_restart(mdev, resend); + _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); + } else /* ns.conn > C_CONNECTED */ + dev_err(DEV, "Unexpected Resynd going on!\n"); + } + + if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) { + spin_lock_irq(&mdev->req_lock); + _tl_restart(mdev, restart_frozen_disk_io); + _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); + } + } + if (fp == FP_STONITH && ns.susp) { /* case1: The outdate peer handler is successful: * case2: The connection was established again: */ diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 73131c5..563a6ad 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -33,6 +33,7 @@ #include #include #include "drbd_int.h" +#include "drbd_req.h" #include "drbd_wrappers.h" #include #include @@ -494,6 +495,8 @@ char *ppsize(char *buf, unsigned long long size) void drbd_suspend_io(struct drbd_conf *mdev) { set_bit(SUSPEND_IO, &mdev->flags); + if (mdev->state.susp) + return; wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); } @@ -1557,6 +1560,7 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n sc.rate = DRBD_RATE_DEF; sc.after = DRBD_AFTER_DEF; sc.al_extents = DRBD_AL_EXTENTS_DEF; + sc.on_no_data = DRBD_ON_NO_DATA_DEF; } else memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); @@ -1765,7 +1769,16 @@ static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) { + drbd_suspend_io(mdev); reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); + if (reply->ret_code == SS_SUCCESS) { + if (mdev->state.conn < C_CONNECTED) + tl_clear(mdev); + if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED) + tl_restart(mdev, fail_frozen_disk_io); + } + drbd_resume_io(mdev); + return 0; } diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 4864758..8259d4f 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -226,6 +226,8 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) return; if (s & RQ_LOCAL_PENDING) return; + if (mdev->state.susp) + return; if (req->master_bio) { /* this is data_received (remote read) @@ -634,6 +636,28 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, /* else: done by handed_over_to_network */ break; + case fail_frozen_disk_io: + if (!(req->rq_state & RQ_LOCAL_COMPLETED)) + break; + + _req_may_be_done(req, m); + break; + + case restart_frozen_disk_io: + if (!(req->rq_state & RQ_LOCAL_COMPLETED)) + break; + + req->rq_state &= ~RQ_LOCAL_COMPLETED; + + rv = MR_READ; + if (bio_data_dir(req->master_bio) == WRITE) + rv = MR_WRITE; + + get_ldev(mdev); + req->w.cb = w_restart_disk_io; + drbd_queue_work(&mdev->data.work, &req->w); + break; + case resend: /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK before the connection loss; only P_BARRIER_ACK was missing. 
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 07cb3b1..f2e45aa 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -105,6 +105,8 @@ enum drbd_req_event { write_completed_with_error, completed_ok, resend, + fail_frozen_disk_io, + restart_frozen_disk_io, nothing, /* for tracing only */ }; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index ca4a16c..3c1e884 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1173,6 +1173,24 @@ int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) return ok; } +int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + + if (bio_data_dir(req->master_bio) == WRITE) + drbd_al_begin_io(mdev, req->sector); + /* Calling drbd_al_begin_io() out of the worker might deadlocks + theoretically. Practically it can not deadlock, since this is + only used when unfreezing IOs. All the extents of the requests + that made it into the TL are already active */ + + drbd_req_make_private_bio(req, req->master_bio); + req->private_bio->bi_bdev = mdev->ldev->backing_bdev; + generic_make_request(req->private_bio); + + return 1; +} + static int _drbd_may_sync_now(struct drbd_conf *mdev) { struct drbd_conf *odev = mdev; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 479ee3a..7be069f 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -91,6 +91,11 @@ enum drbd_after_sb_p { ASB_VIOLENTLY }; +enum drbd_on_no_data { + OND_IO_ERROR, + OND_SUSPEND_IO +}; + /* KEEP the order, do not delete or insert. Only append. */ enum drbd_ret_codes { ERR_CODE_BASE = 100, diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 440b42e..7eb1e98 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -128,6 +128,7 @@ #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT #define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT +#define DRBD_ON_NO_DATA_DEF OND_IO_ERROR #define DRBD_MAX_BIO_BVECS_MIN 0 #define DRBD_MAX_BIO_BVECS_MAX 128 diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index 5f04281..9aebd0d 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -87,6 +87,7 @@ NL_PACKET(syncer_conf, 8, NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) NL_BIT( 65, T_MAY_IGNORE, use_rle) + NL_INTEGER( 75, T_MAY_IGNORE, on_no_data) ) NL_PACKET(invalidate, 9, ) -- cgit v0.10.2 From 47ff2d0a8e7ce87fed180729e8341f650bf585c8 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 18 Jun 2010 13:56:57 +0200 Subject: drbd: Do not allow a fencing-policy of resource-and-stonith with protocol A Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 563a6ad..5288bd72 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -806,6 +806,15 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp goto fail; } + if (get_net_conf(mdev)) { + int prot = mdev->net_conf->wire_protocol; + put_net_conf(mdev); + if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) { + retcode = ERR_STONITH_AND_PROT_A; + goto fail; + } + } + nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); if (IS_ERR(nbc->lo_file)) { dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, @@ 
-1238,7 +1247,16 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, && (new_conf->wire_protocol != DRBD_PROT_C)) { retcode = ERR_NOT_PROTO_C; goto fail; - }; + } + + if (get_ldev(mdev)) { + enum drbd_fencing_p fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) { + retcode = ERR_STONITH_AND_PROT_A; + goto fail; + } + } if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { retcode = ERR_DISCARD; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 8259d4f..fbe0278 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -660,7 +660,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, case resend: /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK - before the connection loss; only P_BARRIER_ACK was missing. + before the connection loss (B&C only); only P_BARRIER_ACK was missing. Trowing them out of the TL here by pretending we got a BARRIER_ACK TODO: Either resync them, or ensure peer was not rebooted. */ if (!(req->rq_state & RQ_NET_OK)) { diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 7be069f..0b2bfb5 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -145,6 +145,7 @@ enum drbd_ret_codes { ERR_CONNECTED = 151, /* DRBD 8.3 only */ ERR_PERM = 152, ERR_NEED_APV_93 = 153, + ERR_STONITH_AND_PROT_A = 154, /* insert new ones above this line */ AFTER_LAST_ERR_CODE -- cgit v0.10.2 From 894c6a946199cf91e52bc1864c3dc6529cceb3db Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 18 Jun 2010 16:03:20 +0200 Subject: drbd: Disabled the crashed_primary detection for re-attach of last data while IO is frozen Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 5288bd72..cda7cb3 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1033,7 +1033,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp else clear_bit(CRASHED_PRIMARY, &mdev->flags); - if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) { + if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && + !(mdev->state.role == R_PRIMARY && mdev->state.susp && + mdev->sync_conf.on_no_data == OND_SUSPEND_IO)) { set_bit(CRASHED_PRIMARY, &mdev->flags); cp_discovered = 1; } -- cgit v0.10.2 From 18a50fa213d46d5592f6542c91ab4c4760cf346c Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 21 Jun 2010 14:14:15 +0200 Subject: drbd: Now we need to handle the ed_uuid of an diskless, unconnected primary correctly Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 106b9ab..c701805 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1297,8 +1297,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, } if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) + if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) { drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); + } /* D_DISKLESS Peer becomes secondary */ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 8daa920..72bc1a1 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3104,6 +3104,11 @@ static int 
receive_uuids(struct drbd_conf *mdev, struct p_header *h) drbd_md_sync(mdev); } put_ldev(mdev); + } else if (mdev->state.disk < D_INCONSISTENT && + mdev->state.role == R_PRIMARY) { + /* I am a diskless primary, the peer just created a new current UUID + for me. */ + drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); } /* Before we test for the disk state, we should wait until an eventually -- cgit v0.10.2 From 999122bc188a4d0a4847bdf1915d357bd6ab53dc Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 10 Jun 2010 16:46:54 +0200 Subject: drbd: Removing a by now obsolete clause in the state sanitizing Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c701805..c502648 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -847,9 +847,6 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) ns.aftr_isp = 0; - if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS) - ns.pdsk = D_UNKNOWN; - /* Abort resync if a disk fails/detaches */ if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { -- cgit v0.10.2 From 1616a25493cce727d582886f162c5bb0abd87e6a Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 10 Jun 2010 16:55:15 +0200 Subject: drbd: Reduce the verbosity of some state transitions State transitions in the space of non-allowed states used to be very noisy. Reduce that, since that has little value for the majority of the user base. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c502648..40baddd 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -997,14 +997,8 @@ int __drbd_set_state(struct drbd_conf *mdev, /* If the old state was illegal as well, then let this happen...*/ - if (is_valid_state(mdev, os) == rv) { - dev_err(DEV, "Considering state change from bad state. " - "Error would be: '%s'\n", - drbd_set_st_err_str(rv)); - print_st(mdev, "old", os); - print_st(mdev, "new", ns); + if (is_valid_state(mdev, os) == rv) rv = is_valid_state_transition(mdev, ns, os); - } } else rv = is_valid_state_transition(mdev, ns, os); } -- cgit v0.10.2 From 87f7be4cf88e93069f4cc63baf2ce70fdfc59c63 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 11 Jun 2010 13:56:33 +0200 Subject: drbd: Run the fence-peer helper asynchronously Since we can not thaw the transfer log, the next logical step is to allow reconnects while the fence-peer handler runs. 
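As a rough userspace analogue of the pattern introduced below (the patch itself uses kthread_run), the sketch shows the same idea with POSIX threads: the slow fence-peer style helper runs in its own detached thread and applies its result when done, so the calling path returns immediately instead of blocking. fence_peer_sync() and apply_result() are made-up names used only for illustration.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static int fence_peer_sync(void)
{
	sleep(1);     /* stands in for the slow external helper */
	return 42;    /* stands in for the resulting peer disk state */
}

static void apply_result(int r)
{
	printf("helper finished, new peer state %d\n", r);
}

static void *fence_peer_worker(void *arg)
{
	(void)arg;
	apply_result(fence_peer_sync());
	return NULL;
}

static void fence_peer_async(void)
{
	pthread_t t;

	if (pthread_create(&t, NULL, fence_peer_worker, NULL))
		fprintf(stderr, "failed to start fence-peer worker\n");
	else
		pthread_detach(t);   /* caller does not wait for the helper */
}

int main(void)
{
	fence_peer_async();
	puts("caller continues immediately");
	sleep(2);   /* demo only: give the worker time to finish before exit */
	return 0;
}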
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 03cc975..ab20c00 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1423,7 +1423,8 @@ extern void resync_after_online_grow(struct drbd_conf *); extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force); -enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); +extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); +extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev); extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); /* drbd_worker.c */ diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index cda7cb3..32d0072 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -38,6 +38,8 @@ #include #include #include +#include +#include static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); @@ -256,6 +258,25 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) return nps; } +static int _try_outdate_peer_async(void *data) +{ + struct drbd_conf *mdev = (struct drbd_conf *)data; + enum drbd_disk_state nps; + + nps = drbd_try_outdate_peer(mdev); + drbd_request_state(mdev, NS(pdsk, nps)); + + return 0; +} + +void drbd_try_outdate_peer_async(struct drbd_conf *mdev) +{ + struct task_struct *opa; + + opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev)); + if (IS_ERR(opa)) + dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n"); +} int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) { diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 72bc1a1..101ad186 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3747,12 +3747,8 @@ static void drbd_disconnect(struct drbd_conf *mdev) put_ldev(mdev); } - if (mdev->state.role == R_PRIMARY) { - if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) { - enum drbd_disk_state nps = drbd_try_outdate_peer(mdev); - drbd_request_state(mdev, NS(pdsk, nps)); - } - } + if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) + drbd_try_outdate_peer_async(mdev); spin_lock_irq(&mdev->req_lock); os = mdev->state; -- cgit v0.10.2 From 43a5182cccae5850f7590f78dd9651bd407be440 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 11 Jun 2010 11:26:34 +0200 Subject: drbd: Delayed creation of current-UUID When a fencing policy of "resource-and-stonith" is configured, and DRBD looses connection to it's peer, we can delay the creation of a new current-UUID until IO gets thawed. That allows one to deploy fence-peer handlers that actually commit suicide on the machine they get started. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index ab20c00..e0e08f5 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -829,6 +829,7 @@ enum { * the peer, if it changed there as well. */ CONN_DRY_RUN, /* Expect disconnect after resync handshake. 
*/ GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ + NEW_CUR_UUID, /* Create new current UUID when thawing IO */ }; struct drbd_bitmap; /* opaque for drbd_conf */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 40baddd..440b1d5 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1252,12 +1252,23 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, } if (fp == FP_STONITH && ns.susp) { - /* case1: The outdate peer handler is successful: - * case2: The connection was established again: */ - if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) || - (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) { + /* case1: The outdate peer handler is successful: */ + if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { tl_clear(mdev); + if (test_bit(NEW_CUR_UUID, &mdev->flags)) { + drbd_uuid_new_current(mdev); + clear_bit(NEW_CUR_UUID, &mdev->flags); + drbd_md_sync(mdev); + } + spin_lock_irq(&mdev->req_lock); + _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); + } + /* case2: The connection was established again: */ + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { + clear_bit(NEW_CUR_UUID, &mdev->flags); spin_lock_irq(&mdev->req_lock); + _tl_restart(mdev, resend); _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); spin_unlock_irq(&mdev->req_lock); } @@ -1280,8 +1291,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, if (get_ldev(mdev)) { if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { - drbd_uuid_new_current(mdev); - drbd_send_uuids(mdev); + if (mdev->state.susp) { + set_bit(NEW_CUR_UUID, &mdev->flags); + } else { + drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); + } } put_ldev(mdev); } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 32d0072..d764f3c 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1810,6 +1810,11 @@ static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) { + if (test_bit(NEW_CUR_UUID, &mdev->flags)) { + drbd_uuid_new_current(mdev); + clear_bit(NEW_CUR_UUID, &mdev->flags); + drbd_md_sync(mdev); + } drbd_suspend_io(mdev); reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); if (reply->ret_code == SS_SUCCESS) { -- cgit v0.10.2 From 481c6f503213ab14f69ce88fff7b1ece325522f2 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 22 Jun 2010 14:03:27 +0200 Subject: drbd: Ensure that the peer was not rebootet in the meantime before resending TL Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index d764f3c..921793c 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -208,9 +208,6 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) return mdev->state.pdsk; } - if (fp == FP_STONITH) - _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE); - r = drbd_khelper(mdev, "fence-peer"); switch ((r>>8) & 0xff) { diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 101ad186..3a8131a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3265,7 +3265,18 @@ static int receive_state(struct drbd_conf *mdev, struct p_header 
*h) ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) ns.disk = mdev->new_state_tmp.disk; - + if (ns.pdsk == D_CONSISTENT && ns.susp && nconn == C_CONNECTED && oconn < C_CONNECTED && + test_bit(NEW_CUR_UUID, &mdev->flags)) { + /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this + for temporal network outages! */ + spin_unlock_irq(&mdev->req_lock); + dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); + tl_clear(mdev); + drbd_uuid_new_current(mdev); + clear_bit(NEW_CUR_UUID, &mdev->flags); + drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0)); + return FALSE; + } rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL); ns = mdev->state; spin_unlock_irq(&mdev->req_lock); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index fbe0278..76b6682 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -662,7 +662,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK before the connection loss (B&C only); only P_BARRIER_ACK was missing. Trowing them out of the TL here by pretending we got a BARRIER_ACK - TODO: Either resync them, or ensure peer was not rebooted. */ + We ensure that the peer was not rebooted */ if (!(req->rq_state & RQ_NET_OK)) { if (req->w.cb) { drbd_queue_work(&mdev->data.work, &req->w); -- cgit v0.10.2 From 65d922c33ebd359db25d5846929b2eafc4238fcc Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 16 Jun 2010 16:18:09 +0200 Subject: drbd: Do not do a hard state change when establishing a connection [bugz 304] Make sure the state engine can deny two primaries to connect Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 3a8131a..224c79e 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3188,6 +3188,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h) enum drbd_conns nconn, oconn; union drbd_state ns, peer_state; enum drbd_disk_state real_peer_disk; + enum chg_state_flags cs_flags; int rv; ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) @@ -3265,6 +3266,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h) ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) ns.disk = mdev->new_state_tmp.disk; + cs_flags = CS_VERBOSE + (oconn < C_CONNECTED && nconn >= C_CONNECTED ? 0 : CS_HARD); if (ns.pdsk == D_CONSISTENT && ns.susp && nconn == C_CONNECTED && oconn < C_CONNECTED && test_bit(NEW_CUR_UUID, &mdev->flags)) { /* Do not allow tl_restart(resend) for a rebooted peer. 
We can only allow this @@ -3277,7 +3279,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h) drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0)); return FALSE; } - rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL); + rv = _drbd_set_state(mdev, ns, cs_flags, NULL); ns = mdev->state; spin_unlock_irq(&mdev->req_lock); -- cgit v0.10.2 From 84dfb9f564208a0331131d1ab922382c7d61a553 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 23 Jun 2010 11:20:05 +0200 Subject: drbd: Fixed a deadlock, probably only affected UP machines After disconnect (most likely mdev->net_cnt == 0) and we are still in an unstable state (!drbd_state_is_stable()). When we get an IO request in drbd_get_max_buffers() (called from __inc_ap_bio_cond(), called from inc_ap_bio()) we wake up misc_wait. Misc_wait is also used in inc_ap_bio() to sleep until the outcome of __inc_ap_bio_cond() changes. => Busy loop! Solution: Have a dedicated wait queue for get_net_conf() and put_net_conf(). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e0e08f5..aa9bb21 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -947,6 +947,7 @@ struct drbd_conf { union drbd_state state; wait_queue_head_t misc_wait; wait_queue_head_t state_wait; /* upon each state change. */ + wait_queue_head_t net_cnt_wait; unsigned int send_cnt; unsigned int recv_cnt; unsigned int read_cnt; @@ -2018,7 +2019,7 @@ static inline void inc_unacked(struct drbd_conf *mdev) static inline void put_net_conf(struct drbd_conf *mdev) { if (atomic_dec_and_test(&mdev->net_cnt)) - wake_up(&mdev->misc_wait); + wake_up(&mdev->net_cnt_wait); } /** diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 440b1d5..9fe9bdd 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2763,6 +2763,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) init_waitqueue_head(&mdev->misc_wait); init_waitqueue_head(&mdev->state_wait); + init_waitqueue_head(&mdev->net_cnt_wait); init_waitqueue_head(&mdev->ee_wait); init_waitqueue_head(&mdev->al_wait); init_waitqueue_head(&mdev->seq_wait); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 224c79e..22d74d7 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3775,7 +3775,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) if (os.conn == C_DISCONNECTING) { struct hlist_head *h; - wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0); + wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0); /* we must not free the tl_hash * while application io is still on the fly */ -- cgit v0.10.2 From cfa03415a14dd0055f2ff8c3d348d4c1452acba6 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 23 Jun 2010 17:18:51 +0200 Subject: drbd: Allow tl_restart() to do IO completion while IO is suspended Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 76b6682..4e1e10d 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -226,8 +226,6 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) return; if (s & RQ_LOCAL_PENDING) return; - if (mdev->state.susp) - return; if (req->master_bio) { /* this is data_received (remote read) @@ -284,6 +282,14 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error 
*m) * protocol A or B, barrier ack still pending... */ } +static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m) +{ + struct drbd_conf *mdev = req->mdev; + + if (!mdev->state.susp) + _req_may_be_done(req, m); +} + /* * checks whether there was an overlapping request * or ee already registered. @@ -425,7 +431,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); req->rq_state &= ~RQ_LOCAL_PENDING; - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); put_ldev(mdev); break; @@ -434,7 +440,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state &= ~RQ_LOCAL_PENDING; __drbd_chk_io_error(mdev, FALSE); - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); put_ldev(mdev); break; @@ -442,7 +448,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, /* it is legal to fail READA */ req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); put_ldev(mdev); break; @@ -460,7 +466,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, /* no point in retrying if there is no good remote data, * or we have no connection. */ if (mdev->state.pdsk != D_UP_TO_DATE) { - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; } @@ -546,7 +552,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state &= ~RQ_NET_QUEUED; /* if we did it right, tl_clear should be scheduled only after * this, so this should not be necessary! */ - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; case handed_over_to_network: @@ -571,7 +577,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, * "completed_ok" events came in, once we return from * _drbd_send_zc_bio (drbd_send_dblock), we have to check * whether it is done already, and end it. */ - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; case read_retry_remote_canceled: @@ -587,7 +593,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, /* if it is still queued, we may not complete it here. * it will be canceled soon. 
*/ if (!(req->rq_state & RQ_NET_QUEUED)) - _req_may_be_done(req, m); + _req_may_be_done(req, m); /* Allowed while state.susp */ break; case write_acked_by_peer_and_sis: @@ -622,7 +628,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(req->rq_state & RQ_NET_PENDING); dec_ap_pending(mdev); req->rq_state &= ~RQ_NET_PENDING; - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; case neg_acked: @@ -632,7 +638,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); req->rq_state |= RQ_NET_DONE; - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); /* else: done by handed_over_to_network */ break; @@ -640,7 +646,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, if (!(req->rq_state & RQ_LOCAL_COMPLETED)) break; - _req_may_be_done(req, m); + _req_may_be_done(req, m); /* Allowed while state.susp */ break; case restart_frozen_disk_io: @@ -685,7 +691,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, } D_ASSERT(req->rq_state & RQ_NET_SENT); req->rq_state |= RQ_NET_DONE; - _req_may_be_done(req, m); + _req_may_be_done(req, m); /* Allowed while state.susp */ break; case data_received: @@ -693,7 +699,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, dec_ap_pending(mdev); req->rq_state &= ~RQ_NET_PENDING; req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; }; -- cgit v0.10.2 From 8f488156c0635dcc9c668737d05386113a745ef9 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 24 Jun 2010 12:05:53 +0200 Subject: drbd: Allow attach while IO is suspended Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 921793c..dc0a9ac 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -954,7 +954,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp drbd_suspend_io(mdev); /* also wait for the last barrier ack. */ - wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt)); + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || mdev->state.susp); /* and for any other previously queued work */ drbd_flush_workqueue(mdev); -- cgit v0.10.2 From f70b3511599c49a3dc20ae349d6cdc5af47659df Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 24 Jun 2010 14:34:40 +0200 Subject: drbd: Do not try to free tl_hash in drbd_disconnect() when IO is suspended We may not free tl_hash when IO is suspended, since we can not wait until ap_bio_cnt reaches zero. We can do this after susp reched 0, since then tl_clear was called Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index aa9bb21..f84ffb1 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1493,6 +1493,7 @@ extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); extern void drbd_flush_workqueue(struct drbd_conf *mdev); +extern void drbd_free_tl_hash(struct drbd_conf *mdev); /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to * mess with get_fs/set_fs, we know we are KERNEL_DS always. 
*/ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 9fe9bdd..8d14635 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1409,6 +1409,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, (os.user_isp && !ns.user_isp)) resume_next_sg(mdev); + /* free tl_hash if we Got thawed and are C_STANDALONE */ + if (ns.conn == C_STANDALONE && ns.susp == 0 && mdev->tl_hash) + drbd_free_tl_hash(mdev); + /* Upon network connection, we need to start the receiver */ if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) drbd_thread_start(&mdev->receiver); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index dc0a9ac..6c08e63 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1391,6 +1391,7 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, } } + drbd_flush_workqueue(mdev); spin_lock_irq(&mdev->req_lock); if (mdev->net_conf != NULL) { retcode = ERR_NET_CONFIGURED; @@ -1429,10 +1430,9 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, mdev->int_dig_out=int_dig_out; mdev->int_dig_in=int_dig_in; mdev->int_dig_vv=int_dig_vv; + retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL); spin_unlock_irq(&mdev->req_lock); - retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE); - kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); reply->ret_code = retcode; drbd_reconfig_done(mdev); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 22d74d7..5e49ee7 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3691,6 +3691,36 @@ void drbd_flush_workqueue(struct drbd_conf *mdev) wait_for_completion(&barr.done); } +void drbd_free_tl_hash(struct drbd_conf *mdev) +{ + struct hlist_head *h; + + spin_lock_irq(&mdev->req_lock); + + if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) { + spin_unlock_irq(&mdev->req_lock); + return; + } + /* paranoia code */ + for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) + if (h->first) + dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", + (int)(h - mdev->ee_hash), h->first); + kfree(mdev->ee_hash); + mdev->ee_hash = NULL; + mdev->ee_hash_s = 0; + + /* paranoia code */ + for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) + if (h->first) + dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n", + (int)(h - mdev->tl_hash), h->first); + kfree(mdev->tl_hash); + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; + spin_unlock_irq(&mdev->req_lock); +} + static void drbd_disconnect(struct drbd_conf *mdev) { enum drbd_fencing_p fp; @@ -3774,32 +3804,14 @@ static void drbd_disconnect(struct drbd_conf *mdev) spin_unlock_irq(&mdev->req_lock); if (os.conn == C_DISCONNECTING) { - struct hlist_head *h; wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0); - /* we must not free the tl_hash - * while application io is still on the fly */ - wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0); - - spin_lock_irq(&mdev->req_lock); - /* paranoia code */ - for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) - if (h->first) - dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", - (int)(h - mdev->ee_hash), h->first); - kfree(mdev->ee_hash); - mdev->ee_hash = NULL; - mdev->ee_hash_s = 0; - - /* paranoia code */ - for (h = mdev->tl_hash; h < mdev->tl_hash + 
mdev->tl_hash_s; h++) - if (h->first) - dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n", - (int)(h - mdev->tl_hash), h->first); - kfree(mdev->tl_hash); - mdev->tl_hash = NULL; - mdev->tl_hash_s = 0; - spin_unlock_irq(&mdev->req_lock); + if (!mdev->state.susp) { + /* we must not free the tl_hash + * while application io is still on the fly */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); + drbd_free_tl_hash(mdev); + } crypto_free_hash(mdev->cram_hmac_tfm); mdev->cram_hmac_tfm = NULL; -- cgit v0.10.2 From 6709893059105d7859ae772af70c7db5bbab7de0 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 24 Jun 2010 16:24:25 +0200 Subject: drbd: Make sure tl_restart(, resend) can not get called multiple times for a new connection Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8d14635..c6658f5 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1209,6 +1209,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, union drbd_state ns, enum chg_state_flags flags) { enum drbd_fencing_p fp; + enum drbd_req_event what = nothing; if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { clear_bit(CRASHED_PRIMARY, &mdev->flags); @@ -1234,21 +1235,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, if (os.susp && ns.susp && mdev->sync_conf.on_no_data == OND_SUSPEND_IO) { if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { - if (ns.conn == C_CONNECTED) { - spin_lock_irq(&mdev->req_lock); - _tl_restart(mdev, resend); - _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->req_lock); - } else /* ns.conn > C_CONNECTED */ + if (ns.conn == C_CONNECTED) + what = resend; + else /* ns.conn > C_CONNECTED */ dev_err(DEV, "Unexpected Resynd going on!\n"); } - if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) { - spin_lock_irq(&mdev->req_lock); - _tl_restart(mdev, restart_frozen_disk_io); - _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->req_lock); - } + if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) + what = restart_frozen_disk_io; } if (fp == FP_STONITH && ns.susp) { @@ -1267,12 +1261,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* case2: The connection was established again: */ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { clear_bit(NEW_CUR_UUID, &mdev->flags); - spin_lock_irq(&mdev->req_lock); - _tl_restart(mdev, resend); - _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->req_lock); + what = resend; } } + + if (what != nothing) { + spin_lock_irq(&mdev->req_lock); + _tl_restart(mdev, what); + _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); + } + /* Do not change the order of the if above and the two below... 
*/ if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ drbd_send_uuids(mdev); -- cgit v0.10.2 From e756414f7daa93b862f1670dd0a6aaa676ea71e3 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 29 Jun 2010 17:35:34 +0200 Subject: drbd: Initialize all members of sync_conf to their defaults [Bugz 315] Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c6658f5..410d3d4 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2689,7 +2689,8 @@ static void drbd_set_defaults(struct drbd_conf *mdev) /* .verify_alg = */ {}, 0, /* .cpu_mask = */ {}, 0, /* .csums_alg = */ {}, 0, - /* .use_rle = */ 0 + /* .use_rle = */ 0, + /* .on_no_data = */ DRBD_ON_NO_DATA_DEF }; /* Have to use that way, because the layout differs between -- cgit v0.10.2 From d28fd092a55b504a0d699b65802a995086d70647 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 9 Jul 2010 23:28:10 +0200 Subject: drbd: fix list corruption (recent regression) The commit 288f422ec13667de40b278535d2a5fb5c77352c4 drbd: Track all IO requests on the TL, not writes only moved a list_add_tail(req, ) into a region where req may have just been freed due to conflict detection. Fix this by adding a proper cleanup section for that code path. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 4e1e10d..3b61d76 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -917,31 +917,8 @@ allocate_barrier: /* check this request on the collision detection hash tables. * if we have a conflict, just complete it here. * THINK do we want to check reads, too? (I don't think so...) */ - if (rw == WRITE && _req_conflicts(req)) { - /* this is a conflicting request. - * even though it may have been only _partially_ - * overlapping with one of the currently pending requests, - * without even submitting or sending it, we will - * pretend that it was successfully served right now. - */ - if (local) { - bio_put(req->private_bio); - req->private_bio = NULL; - drbd_al_complete_io(mdev, req->sector); - put_ldev(mdev); - local = 0; - } - if (remote) - dec_ap_pending(mdev); - _drbd_end_io_acct(mdev, req); - /* THINK: do we want to fail it (-EIO), or pretend success? */ - bio_endio(req->master_bio, 0); - req->master_bio = NULL; - dec_ap_bio(mdev); - drbd_req_free(req); - remote = 0; - } - + if (rw == WRITE && _req_conflicts(req)) + goto fail_conflicting; list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); @@ -976,6 +953,21 @@ allocate_barrier: return 0; +fail_conflicting: + /* this is a conflicting request. + * even though it may have been only _partially_ + * overlapping with one of the currently pending requests, + * without even submitting or sending it, we will + * pretend that it was successfully served right now. + */ + _drbd_end_io_acct(mdev, req); + spin_unlock_irq(&mdev->req_lock); + if (remote) + dec_ap_pending(mdev); + /* THINK: do we want to fail it (-EIO), or pretend success? + * this pretends success. 
*/ + err = 0; + fail_free_complete: if (rw == WRITE && local) drbd_al_complete_io(mdev, sector); -- cgit v0.10.2 From 9a31d7164d409ca59cfadb7957ac7b0acf4545b8 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 5 Jul 2010 13:42:03 +0200 Subject: drbd: New sync parameters for the smart resync rate controller Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 410d3d4..5a484c1 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2690,7 +2690,11 @@ static void drbd_set_defaults(struct drbd_conf *mdev) /* .cpu_mask = */ {}, 0, /* .csums_alg = */ {}, 0, /* .use_rle = */ 0, - /* .on_no_data = */ DRBD_ON_NO_DATA_DEF + /* .on_no_data = */ DRBD_ON_NO_DATA_DEF, + /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF, + /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF, + /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF, + /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF }; /* Have to use that way, because the layout differs between diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6c08e63..7d384fd 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1599,6 +1599,10 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n sc.after = DRBD_AFTER_DEF; sc.al_extents = DRBD_AL_EXTENTS_DEF; sc.on_no_data = DRBD_ON_NO_DATA_DEF; + sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF; + sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF; + sc.c_fill_target = DRBD_C_FILL_TARGET_DEF; + sc.c_max_rate = DRBD_C_MAX_RATE_DEF; } else memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 7eb1e98..06dbba4 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -134,21 +134,21 @@ #define DRBD_MAX_BIO_BVECS_MAX 128 #define DRBD_MAX_BIO_BVECS_DEF 0 -#define DRBD_DP_VOLUME_MIN 4 -#define DRBD_DP_VOLUME_MAX 1048576 -#define DRBD_DP_VOLUME_DEF 16384 +#define DRBD_C_PLAN_AHEAD_MIN 0 +#define DRBD_C_PLAN_AHEAD_MAX 300 +#define DRBD_C_PLAN_AHEAD_DEF 0 /* RS rate controller disabled by default */ -#define DRBD_DP_INTERVAL_MIN 1 -#define DRBD_DP_INTERVAL_MAX 600 -#define DRBD_DP_INTERVAL_DEF 5 +#define DRBD_C_DELAY_TARGET_MIN 1 +#define DRBD_C_DELAY_TARGET_MAX 100 +#define DRBD_C_DELAY_TARGET_DEF 10 -#define DRBD_RS_THROTTLE_TH_MIN 1 -#define DRBD_RS_THROTTLE_TH_MAX 600 -#define DRBD_RS_THROTTLE_TH_DEF 20 +#define DRBD_C_FILL_TARGET_MIN 0 +#define DRBD_C_FILL_TARGET_MAX 100000 +#define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */ -#define DRBD_RS_HOLD_OFF_TH_MIN 1 -#define DRBD_RS_HOLD_OFF_TH_MAX 6000 -#define DRBD_RS_HOLD_OFF_TH_DEF 100 +#define DRBD_C_MAX_RATE_MIN 250 /* kByte/sec */ +#define DRBD_C_MAX_RATE_MAX (4 << 20) +#define DRBD_C_MAX_RATE_DEF 102400 #undef RANGE #endif diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index 9aebd0d..e23683c 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -88,6 +88,10 @@ NL_PACKET(syncer_conf, 8, NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) NL_BIT( 65, T_MAY_IGNORE, use_rle) NL_INTEGER( 75, T_MAY_IGNORE, on_no_data) + NL_INTEGER( 76, T_MAY_IGNORE, c_plan_ahead) + NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target) + NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target) + NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate) ) NL_PACKET(invalidate, 9, ) -- cgit v0.10.2 From 8e26f9ccb9be00fdb33551a34c8f6029e89ab79f Mon Sep 17 00:00:00 2001 From: Philipp Reisner 
Date: Tue, 6 Jul 2010 17:25:54 +0200 Subject: drbd: New sync_param packet, that includes the parameters of the new controller Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f84ffb1..fd2cdd4 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -451,6 +451,17 @@ struct p_rs_param_89 { char csums_alg[SHARED_SECRET_MAX]; } __packed; +struct p_rs_param_95 { + struct p_header head; + u32 rate; + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; + u32 c_plan_ahead; + u32 c_delay_target; + u32 c_fill_target; + u32 c_max_rate; +} __packed; + enum drbd_conn_flags { CF_WANT_LOSE = 1, CF_DRY_RUN = 2, @@ -610,6 +621,7 @@ union p_polymorph { struct p_barrier barrier; struct p_barrier_ack barrier_ack; struct p_rs_param_89 rs_param_89; + struct p_rs_param_95 rs_param_95; struct p_protocol protocol; struct p_sizes sizes; struct p_uuids uuids; @@ -1268,6 +1280,8 @@ struct bm_extent { * Bit 1 ==> local node thinks this block needs to be synced. */ +#define SLEEP_TIME (HZ/10) + #define BM_BLOCK_SHIFT 12 /* 4k per bit */ #define BM_BLOCK_SIZE (1<agreed_pro_version; @@ -1721,7 +1721,8 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) size = apv <= 87 ? sizeof(struct p_rs_param) : apv == 88 ? sizeof(struct p_rs_param) + strlen(mdev->sync_conf.verify_alg) + 1 - : /* 89 */ sizeof(struct p_rs_param_89); + : apv <= 94 ? sizeof(struct p_rs_param_89) + : /* apv >= 95 */ sizeof(struct p_rs_param_95); /* used from admin command context and receiver/worker context. * to avoid kmalloc, grab the socket right here, @@ -1732,12 +1733,16 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) if (likely(sock != NULL)) { enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; - p = &mdev->data.sbuf.rs_param_89; + p = &mdev->data.sbuf.rs_param_95; /* initialize verify_alg and csums_alg */ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); p->rate = cpu_to_be32(sc->rate); + p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead); + p->c_delay_target = cpu_to_be32(sc->c_delay_target); + p->c_fill_target = cpu_to_be32(sc->c_fill_target); + p->c_max_rate = cpu_to_be32(sc->c_max_rate); if (apv >= 88) strcpy(p->verify_alg, mdev->sync_conf.verify_alg); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 5e49ee7..34bea97 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2805,7 +2805,7 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) { int ok = TRUE; - struct p_rs_param_89 *p = (struct p_rs_param_89 *)h; + struct p_rs_param_95 *p = (struct p_rs_param_95 *)h; unsigned int header_size, data_size, exp_max_sz; struct crypto_hash *verify_tfm = NULL; struct crypto_hash *csums_tfm = NULL; @@ -2814,7 +2814,8 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) : apv == 88 ? sizeof(struct p_rs_param) + SHARED_SECRET_MAX - : /* 89 */ sizeof(struct p_rs_param_89); + : apv <= 94 ? 
sizeof(struct p_rs_param_89) + : /* apv >= 95 */ sizeof(struct p_rs_param_95); if (h->length > exp_max_sz) { dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", @@ -2825,10 +2826,14 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) if (apv <= 88) { header_size = sizeof(struct p_rs_param) - sizeof(*h); data_size = h->length - header_size; - } else /* apv >= 89 */ { + } else if (apv <= 94) { header_size = sizeof(struct p_rs_param_89) - sizeof(*h); data_size = h->length - header_size; D_ASSERT(data_size == 0); + } else { + header_size = sizeof(struct p_rs_param_95) - sizeof(*h); + data_size = h->length - header_size; + D_ASSERT(data_size == 0); } /* initialize verify_alg and csums_alg */ @@ -2893,6 +2898,13 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) } } + if (apv > 94) { + mdev->sync_conf.rate = be32_to_cpu(p->rate); + mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead); + mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target); + mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target); + mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate); + } spin_lock(&mdev->peer_seq_lock); /* lock against drbd_nl_syncer_conf() */ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 3c1e884..d94720f 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -39,8 +39,6 @@ #include "drbd_int.h" #include "drbd_req.h" -#define SLEEP_TIME (HZ/10) - static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); -- cgit v0.10.2 From 778f271dfe7a7173c0bae2d6cde8d9bd1533e668 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 6 Jul 2010 11:14:00 +0200 Subject: drbd: The new, smarter resync speed controller Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index fd2cdd4..facb72c 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -928,6 +928,12 @@ enum write_ordering_e { WO_bio_barrier }; +struct fifo_buffer { + int *values; + unsigned int head_index; + unsigned int size; +}; + struct drbd_conf { /* things that are stored as / read from meta data on disk */ unsigned long flags; @@ -1068,6 +1074,11 @@ struct drbd_conf { u64 ed_uuid; /* UUID of the exposed data */ struct mutex state_mutex; char congestion_reason; /* Why we where congested... 
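Both drbd_send_sync_param() and receive_SyncParam() above derive the parameter-packet size from the agreed protocol version: plain p_rs_param up to 87, verify_alg appended for 88, p_rs_param_89 for 89-94, and p_rs_param_95 from 95 on. A stand-alone sketch of that selection, with the structs stripped of their headers and all names purely illustrative:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define SHARED_SECRET_MAX 64

struct p_rs_param    { uint32_t rate; char verify_alg[]; };
struct p_rs_param_89 { uint32_t rate; char verify_alg[SHARED_SECRET_MAX];
		       char csums_alg[SHARED_SECRET_MAX]; };
struct p_rs_param_95 { uint32_t rate; char verify_alg[SHARED_SECRET_MAX];
		       char csums_alg[SHARED_SECRET_MAX];
		       uint32_t c_plan_ahead, c_delay_target,
				c_fill_target, c_max_rate; };

/* Pick the payload size from the agreed protocol version. */
static size_t rs_param_size(int apv, const char *verify_alg)
{
	if (apv <= 87)
		return sizeof(struct p_rs_param);
	if (apv == 88)	/* 88 sends the verify_alg name inline */
		return sizeof(struct p_rs_param) + strlen(verify_alg) + 1;
	if (apv <= 94)
		return sizeof(struct p_rs_param_89);
	return sizeof(struct p_rs_param_95);	/* apv >= 95 */
}

int main(void)
{
	printf("apv 95 payload: %zu bytes\n", rs_param_size(95, "sha1"));
	return 0;
}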
*/ + atomic_t rs_sect_in; /* counter to measure the incoming resync data rate */ + int c_sync_rate; /* current resync rate after delay_probe magic */ + struct fifo_buffer rs_plan_s; /* correction values of resync planer */ + int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ + int rs_planed; /* resync sectors already planed */ }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index bff4f59..ed09a84 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2734,6 +2734,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->net_cnt, 0); atomic_set(&mdev->packet_seq, 0); atomic_set(&mdev->pp_in_use, 0); + atomic_set(&mdev->rs_sect_in, 0); mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 7d384fd..295b8d5 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1587,6 +1587,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n struct crypto_hash *csums_tfm = NULL; struct syncer_conf sc; cpumask_var_t new_cpu_mask; + int *rs_plan_s = NULL; + int fifo_size; if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { retcode = ERR_NOMEM; @@ -1687,6 +1689,16 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n if (retcode != NO_ERROR) goto fail; + fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { + rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); + if (!rs_plan_s) { + dev_err(DEV, "kmalloc of fifo_buffer failed"); + retcode = ERR_NOMEM; + goto fail; + } + } + /* ok, assign the rest of it as well. * lock against receive_SyncParam() */ spin_lock(&mdev->peer_seq_lock); @@ -1703,6 +1715,15 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n mdev->verify_tfm = verify_tfm; verify_tfm = NULL; } + + if (fifo_size != mdev->rs_plan_s.size) { + kfree(mdev->rs_plan_s.values); + mdev->rs_plan_s.values = rs_plan_s; + mdev->rs_plan_s.size = fifo_size; + mdev->rs_planed = 0; + rs_plan_s = NULL; + } + spin_unlock(&mdev->peer_seq_lock); if (get_ldev(mdev)) { @@ -1734,6 +1755,7 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); fail: + kfree(rs_plan_s); free_cpumask_var(new_cpu_mask); crypto_free_hash(csums_tfm); crypto_free_hash(verify_tfm); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 34bea97..5f80b22 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1640,6 +1640,8 @@ static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) drbd_send_ack_dp(mdev, P_NEG_ACK, p); } + atomic_add(data_size >> 9, &mdev->rs_sect_in); + return ok; } @@ -2810,6 +2812,8 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) struct crypto_hash *verify_tfm = NULL; struct crypto_hash *csums_tfm = NULL; const int apv = mdev->agreed_pro_version; + int *rs_plan_s = NULL; + int fifo_size = 0; exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) : apv == 88 ? 
sizeof(struct p_rs_param) @@ -2904,6 +2908,15 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target); mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target); mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate); + + fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { + rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); + if (!rs_plan_s) { + dev_err(DEV, "kmalloc of fifo_buffer failed"); + goto disconnect; + } + } } spin_lock(&mdev->peer_seq_lock); @@ -2922,6 +2935,12 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) mdev->csums_tfm = csums_tfm; dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); } + if (fifo_size != mdev->rs_plan_s.size) { + kfree(mdev->rs_plan_s.values); + mdev->rs_plan_s.values = rs_plan_s; + mdev->rs_plan_s.size = fifo_size; + mdev->rs_planed = 0; + } spin_unlock(&mdev->peer_seq_lock); } @@ -4202,6 +4221,7 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); dec_rs_pending(mdev); + atomic_add(blksize >> 9, &mdev->rs_sect_in); return TRUE; } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index d94720f..fd3e1e9 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -422,6 +422,89 @@ void resync_timer_fn(unsigned long data) drbd_queue_work(&mdev->data.work, &mdev->resync_work); } +static void fifo_set(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] += value; +} + +static int fifo_push(struct fifo_buffer *fb, int value) +{ + int ov; + + ov = fb->values[fb->head_index]; + fb->values[fb->head_index++] = value; + + if (fb->head_index >= fb->size) + fb->head_index = 0; + + return ov; +} + +static void fifo_add_val(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] += value; +} + +int drbd_rs_controller(struct drbd_conf *mdev) +{ + unsigned int sect_in; /* Number of sectors that came in since the last turn */ + unsigned int want; /* The number of sectors we want in the proxy */ + int req_sect; /* Number of sectors to request in this turn */ + int correction; /* Number of sectors more we need in the proxy*/ + int cps; /* correction per invocation of drbd_rs_controller() */ + int steps; /* Number of time steps to plan ahead */ + int curr_corr; + int max_sect; + + sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */ + mdev->rs_in_flight -= sect_in; + + spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */ + + steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ + + if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */ + want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps; + } else { /* normal path */ + want = mdev->sync_conf.c_fill_target ? 
mdev->sync_conf.c_fill_target : + sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10); + } + + correction = want - mdev->rs_in_flight - mdev->rs_planed; + + /* Plan ahead */ + cps = correction / steps; + fifo_add_val(&mdev->rs_plan_s, cps); + mdev->rs_planed += cps * steps; + + /* What we do in this step */ + curr_corr = fifo_push(&mdev->rs_plan_s, 0); + spin_unlock(&mdev->peer_seq_lock); + mdev->rs_planed -= curr_corr; + + req_sect = sect_in + curr_corr; + if (req_sect < 0) + req_sect = 0; + + max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ; + if (req_sect > max_sect) + req_sect = max_sect; + + /* + dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n", + sect_in, mdev->rs_in_flight, want, correction, + steps, cps, mdev->rs_planed, curr_corr, req_sect); + */ + + return req_sect; +} + int w_make_resync_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { @@ -459,7 +542,13 @@ int w_make_resync_request(struct drbd_conf *mdev, max_segment_size = mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE; - number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE / 1024) * HZ); + if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */ + number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); + mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; + } else { + mdev->c_sync_rate = mdev->sync_conf.rate; + number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); + } pe = atomic_read(&mdev->rs_pending_cnt); mutex_lock(&mdev->data.mutex); @@ -593,6 +682,7 @@ next_sector: } requeue: + mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); put_ldev(mdev); return 1; @@ -1419,6 +1509,12 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) drbd_resync_finished(mdev); } + atomic_set(&mdev->rs_sect_in, 0); + mdev->rs_in_flight = 0; + mdev->rs_planed = 0; + spin_lock(&mdev->peer_seq_lock); + fifo_set(&mdev->rs_plan_s, 0); + spin_unlock(&mdev->peer_seq_lock); /* ns.conn may already be != mdev->state.conn, * we may have been paused in between, or become paused until * the timer triggers. -- cgit v0.10.2 From d207450cf2731c6a2afa8c78fb31c7206cd35eba Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 22 Jul 2010 15:27:27 +0200 Subject: drbd: Bugfix: rs_in_flight could become wrong if read_for_csum() requested reschedule later Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index fd3e1e9..30b8e46 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -512,7 +512,7 @@ int w_make_resync_request(struct drbd_conf *mdev, sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); int max_segment_size; - int number, i, size, pe, mx; + int number, i, rollback_i, size, pe, mx; int align, queued, sndbuf; if (unlikely(cancel)) @@ -613,6 +613,7 @@ next_sector: * be prepared for all stripe sizes of software RAIDs. 
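drbd_rs_controller() above keeps a FIFO of planned corrections: each SLEEP_TIME step it measures the sectors that actually arrived, compares in-flight plus planned work against the wanted fill level, spreads the difference over the plan-ahead window, and caps the per-step request at what c_max_rate allows. A stripped-down userspace sketch of that loop (fixed constants, in-flight accounting folded into the step function, names illustrative rather than the driver's):

#include <stdio.h>

#define STEPS    10	/* plan-ahead window, in controller steps */
#define MAX_SECT 2000	/* per-step cap, stands in for c_max_rate */

static int plan[STEPS];	/* correction planned for the next STEPS steps */
static int head;
static int in_flight;	/* sectors requested but not yet received */
static int planned;	/* sum of the values currently in plan[] */

/* push a value for "now", return what had been planned for this step */
static int fifo_push(int value)
{
	int old = plan[head];

	plan[head] = value;
	head = (head + 1) % STEPS;
	return old;
}

static void fifo_add_val(int value)
{
	for (int i = 0; i < STEPS; i++)
		plan[i] += value;
}

/* one controller step: how many sectors to request next */
static int controller_step(int sect_in, int want)
{
	int correction, cps, curr, req;

	in_flight -= sect_in;			/* those sectors arrived */
	correction = want - in_flight - planned;

	cps = correction / STEPS;		/* spread over the window */
	fifo_add_val(cps);
	planned += cps * STEPS;

	curr = fifo_push(0);			/* this step's share */
	planned -= curr;

	req = sect_in + curr;
	if (req < 0)
		req = 0;
	if (req > MAX_SECT)
		req = MAX_SECT;
	in_flight += req;			/* submitted now */
	return req;
}

int main(void)
{
	int req = 0;

	for (int step = 0; step < 5; step++) {
		/* pretend everything requested last step has arrived */
		req = controller_step(req, 1000);
		printf("step %d: request %d sectors\n", step, req);
	}
	return 0;
}

In the driver the result is converted from sectors to bitmap blocks with >> (BM_BLOCK_SHIFT - 9), and c_sync_rate is derived from it so /proc/drbd can report the rate the controller currently wants.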
*/ align = 1; + rollback_i = i; for (;;) { if (size + BM_BLOCK_SIZE > max_segment_size) break; @@ -654,6 +655,7 @@ next_sector: case 2: /* Allocation failed */ drbd_rs_complete_io(mdev, sector); mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); + i = rollback_i; goto requeue; /* case 1: everything ok */ } -- cgit v0.10.2 From 85719573dd716bc2ac3e098b44adfed884250bab Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 21 Jul 2010 10:20:17 +0200 Subject: drbd: Replaced some casts by an union. Improved comments Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index facb72c..b0cbfa1 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -759,7 +759,7 @@ struct digest_info { struct drbd_epoch_entry { struct drbd_work w; struct hlist_node colision; - struct drbd_epoch *epoch; + struct drbd_epoch *epoch; /* for writes */ struct drbd_conf *mdev; struct page *pages; atomic_t pending_bios; @@ -767,7 +767,10 @@ struct drbd_epoch_entry { /* see comments on ee flag bits below */ unsigned long flags; sector_t sector; - u64 block_id; + union { + u64 block_id; + struct digest_info *digest; + }; }; /* ee flag bits. @@ -1032,10 +1035,10 @@ struct drbd_conf { spinlock_t epoch_lock; unsigned int epochs; enum write_ordering_e write_ordering; - struct list_head active_ee; /* IO in progress */ - struct list_head sync_ee; /* IO in progress */ + struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ + struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ struct list_head done_ee; /* send ack */ - struct list_head read_ee; /* IO in progress */ + struct list_head read_ee; /* IO in progress (any read) */ struct list_head net_ee; /* zero-copy network send in progress */ struct hlist_head *ee_hash; /* is proteced by req_lock! */ unsigned int ee_hash_s; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 5f80b22..0b03e31 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2097,7 +2097,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) if (drbd_recv(mdev, di->digest, digest_size) != digest_size) goto out_free_e; - e->block_id = (u64)(unsigned long)di; + e->digest = di; if (h->command == P_CSUM_RS_REQUEST) { D_ASSERT(mdev->agreed_pro_version >= 89); e->w.cb = w_e_end_csum_rs_req; @@ -3769,6 +3769,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) drbd_thread_stop(&mdev->asender); drbd_free_sock(mdev); + /* wait for current activity to cease. */ spin_lock_irq(&mdev->req_lock); _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 30b8e46..f979e22 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1018,7 +1018,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) drbd_rs_complete_io(mdev, e->sector); - di = (struct digest_info *)(unsigned long)e->block_id; + di = e->digest; if (likely((e->flags & EE_WAS_ERROR) == 0)) { /* quick hack to try to avoid a race against reconfiguration. 
@@ -1126,7 +1126,7 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) * the resync lru has been cleaned up already */ drbd_rs_complete_io(mdev, e->sector); - di = (struct digest_info *)(unsigned long)e->block_id; + di = e->digest; if (likely((e->flags & EE_WAS_ERROR) == 0)) { digest_size = crypto_hash_digestsize(mdev->verify_tfm); -- cgit v0.10.2 From c36c3ced692b38d0cf90a5e6f875be2f9ebbc037 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 11 Aug 2010 20:42:55 +0200 Subject: drbd: let drbd_free_ee implicitly free any digest Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b0cbfa1..3a94174 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -796,12 +796,16 @@ enum { * if any of those fail, we set this flag atomically * from the endio callback */ __EE_WAS_ERROR, + + /* This ee has a pointer to a digest instead of a block id */ + __EE_HAS_DIGEST, }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) #define EE_IS_BARRIER (1<<__EE_IS_BARRIER) #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) +#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) /* global flag bits */ enum { diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 0b03e31..2f9320b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -377,6 +377,8 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) { + if (e->flags & EE_HAS_DIGEST) + kfree(e->digest); drbd_pp_free(mdev, e->pages); D_ASSERT(atomic_read(&e->pending_bios) == 0); D_ASSERT(hlist_unhashed(&e->colision)); @@ -2094,10 +2096,12 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) di->digest_size = digest_size; di->digest = (((char *)di)+sizeof(struct digest_info)); + e->digest = di; + e->flags |= EE_HAS_DIGEST; + if (drbd_recv(mdev, di->digest, digest_size) != digest_size) goto out_free_e; - e->digest = di; if (h->command == P_CSUM_RS_REQUEST) { D_ASSERT(mdev->agreed_pro_version >= 89); e->w.cb = w_e_end_csum_rs_req; @@ -2159,7 +2163,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) return TRUE; out_free_e: - kfree(di); put_ldev(mdev); drbd_free_ee(mdev, e); return FALSE; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index f979e22..48452fe 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1052,9 +1052,6 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } dec_unacked(mdev); - - kfree(di); - move_to_net_ee_or_free(mdev, e); if (unlikely(!ok)) @@ -1145,9 +1142,6 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } dec_unacked(mdev); - - kfree(di); - if (!eq) drbd_ov_oos_found(mdev, e->sector, e->size); else -- cgit v0.10.2 From 0bb70bf601579b0d4c56acbb54b8eb0688541e19 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 11 Aug 2010 20:53:21 +0200 Subject: drbd: remove outdated comment and dead code Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 3a94174..72d2047 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -858,10 +858,6 @@ struct drbd_bitmap; /* opaque 
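With the union and the EE_HAS_DIGEST flag introduced above, an epoch entry carries either a block_id or a digest pointer, and drbd_free_ee() can release the digest without every caller having to remember the kfree(). A compact stand-alone illustration of that tagged-union idea (simplified types, not the kernel structures):

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

#define EE_HAS_DIGEST (1u << 0)	/* "digest" member is valid, not block_id */

struct digest_info { int size; unsigned char data[]; };

struct epoch_entry {
	unsigned long flags;
	union {			/* the two uses are mutually exclusive */
		uint64_t block_id;
		struct digest_info *digest;
	};
};

static void free_entry(struct epoch_entry *e)
{
	if (e->flags & EE_HAS_DIGEST)	/* free the digest implicitly */
		free(e->digest);
	free(e);
}

int main(void)
{
	struct epoch_entry *e = calloc(1, sizeof(*e));

	e->digest = calloc(1, sizeof(struct digest_info) + 20);
	e->flags |= EE_HAS_DIGEST;
	printf("entry carries a digest: %s\n",
	       (e->flags & EE_HAS_DIGEST) ? "yes" : "no");
	free_entry(e);
	return 0;
}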
for drbd_conf */ /* THINK maybe we actually want to use the default "event/%s" worker threads * or similar in linux 2.6, which uses per cpu data and threads. - * - * To be general, this might need a spin_lock member. - * For now, please use the mdev->req_lock to protect list_head, - * see drbd_queue_work below. */ struct drbd_work_queue { struct list_head q; @@ -1895,13 +1891,6 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, } static inline void -_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) -{ - list_add_tail(&w->list, &q->q); - up(&q->s); -} - -static inline void drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) { unsigned long flags; -- cgit v0.10.2 From 1d7734a0df02ff5068ff8baa1447c7baee601db1 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 11 Aug 2010 21:21:50 +0200 Subject: drbd: use rolling marks for resync speed calculation The current resync speed as displayed in /proc/drbd fluctuates a lot. Using an array of rolling marks makes this calculation much more stable. We used to have this (a long time ago with 0.7), but it got lost somehow. If "stalled", do not discard the rest of the information, just add a " (stalled)" tag to the progress line. This patch also shortens a spinlock critical section somewhat, and reduces the number of atomic operations in put_ldev. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 9400845..b895470 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -965,29 +965,30 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, * ok, (capacity & 7) != 0 sometimes, but who cares... * we count rs_{total,left} in bits, not sectors. */ - spin_lock_irqsave(&mdev->al_lock, flags); count = drbd_bm_clear_bits(mdev, sbnr, ebnr); - if (count) { - /* we need the lock for drbd_try_clear_on_disk_bm */ - if (jiffies - mdev->rs_mark_time > HZ*10) { - /* should be rolling marks, - * but we estimate only anyways. */ - if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) && + if (count && get_ldev(mdev)) { + unsigned long now = jiffies; + unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark]; + int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS; + if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { + unsigned long tw = drbd_bm_total_weight(mdev); + if (mdev->rs_mark_left[mdev->rs_last_mark] != tw && mdev->state.conn != C_PAUSED_SYNC_T && mdev->state.conn != C_PAUSED_SYNC_S) { - mdev->rs_mark_time = jiffies; - mdev->rs_mark_left = drbd_bm_total_weight(mdev); + mdev->rs_mark_time[next] = now; + mdev->rs_mark_left[next] = tw; + mdev->rs_last_mark = next; } } - if (get_ldev(mdev)) { - drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); - put_ldev(mdev); - } + spin_lock_irqsave(&mdev->al_lock, flags); + drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); + spin_unlock_irqrestore(&mdev->al_lock, flags); + /* just wake_up unconditional now, various lc_chaged(), * lc_put() in drbd_try_clear_on_disk_bm(). 
*/ wake_up = 1; + put_ldev(mdev); } - spin_unlock_irqrestore(&mdev->al_lock, flags); if (wake_up) wake_up(&mdev->al_wait); } diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 72d2047..0fce3f3 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -997,12 +997,16 @@ struct drbd_conf { unsigned long rs_start; /* cumulated time in PausedSyncX state [unit jiffies] */ unsigned long rs_paused; + /* skipped because csum was equal [unit BM_BLOCK_SIZE] */ + unsigned long rs_same_csum; +#define DRBD_SYNC_MARKS 8 +#define DRBD_SYNC_MARK_STEP (3*HZ) /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ - unsigned long rs_mark_left; + unsigned long rs_mark_left[DRBD_SYNC_MARKS]; /* marks's time [unit jiffies] */ - unsigned long rs_mark_time; - /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */ - unsigned long rs_same_csum; + unsigned long rs_mark_time[DRBD_SYNC_MARKS]; + /* current index into rs_mark_{left,time} */ + int rs_last_mark; /* where does the admin want us to start? (sector) */ sector_t ov_start_sector; @@ -1077,8 +1081,12 @@ struct drbd_conf { u64 ed_uuid; /* UUID of the exposed data */ struct mutex state_mutex; char congestion_reason; /* Why we where congested... */ - atomic_t rs_sect_in; /* counter to measure the incoming resync data rate */ - int c_sync_rate; /* current resync rate after delay_probe magic */ + atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ + atomic_t rs_sect_ev; /* for submitted resync data rate, both */ + int rs_last_sect_ev; /* counter to compare with */ + int rs_last_events; /* counter of read or write "events" (unit sectors) + * on the lower level device when we last looked. */ + int c_sync_rate; /* current resync rate after syncer throttle magic */ struct fifo_buffer rs_plan_s; /* correction values of resync planer */ int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ int rs_planed; /* resync sectors already planed */ @@ -2072,10 +2080,11 @@ static inline int get_net_conf(struct drbd_conf *mdev) static inline void put_ldev(struct drbd_conf *mdev) { + int i = atomic_dec_return(&mdev->local_cnt); __release(local); - if (atomic_dec_and_test(&mdev->local_cnt)) + D_ASSERT(i >= 0); + if (i == 0) wake_up(&mdev->misc_wait); - D_ASSERT(atomic_read(&mdev->local_cnt) >= 0); } #ifndef __CHECKER__ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index ed09a84..1ff8418a 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1064,7 +1064,8 @@ int __drbd_set_state(struct drbd_conf *mdev, if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { dev_info(DEV, "Syncer continues.\n"); - mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; + mdev->rs_paused += (long)jiffies + -(long)mdev->rs_mark_time[mdev->rs_last_mark]; if (ns.conn == C_SYNC_TARGET) { if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags)) mod_timer(&mdev->resync_timer, jiffies); @@ -1078,27 +1079,33 @@ int __drbd_set_state(struct drbd_conf *mdev, if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { dev_info(DEV, "Resync suspended\n"); - mdev->rs_mark_time = jiffies; + mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; if (ns.conn == C_PAUSED_SYNC_T) set_bit(STOP_SYNC_TIMER, &mdev->flags); } if (os.conn == C_CONNECTED && (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { + unsigned long now = 
jiffies; + int i; + mdev->ov_position = 0; - mdev->rs_total = - mdev->rs_mark_left = drbd_bm_bits(mdev); + mdev->rs_total = drbd_bm_bits(mdev); if (mdev->agreed_pro_version >= 90) set_ov_position(mdev, ns.conn); else mdev->ov_start_sector = 0; mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(mdev->ov_position); - mdev->rs_start = - mdev->rs_mark_time = jiffies; + mdev->rs_start = now; mdev->ov_last_oos_size = 0; mdev->ov_last_oos_start = 0; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + mdev->rs_mark_left[i] = mdev->rs_total; + mdev->rs_mark_time[i] = now; + } + if (ns.conn == C_VERIFY_S) { dev_info(DEV, "Starting Online Verify from sector %llu\n", (unsigned long long)mdev->ov_position); @@ -2793,6 +2800,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) void drbd_mdev_cleanup(struct drbd_conf *mdev) { + int i; if (mdev->receiver.t_state != None) dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", mdev->receiver.t_state); @@ -2809,9 +2817,12 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) mdev->p_size = mdev->rs_start = mdev->rs_total = - mdev->rs_failed = - mdev->rs_mark_left = - mdev->rs_mark_time = 0; + mdev->rs_failed = 0; + mdev->rs_last_events = 0; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + mdev->rs_mark_left[i] = 0; + mdev->rs_mark_time[i] = 0; + } D_ASSERT(mdev->net_conf == NULL); drbd_set_my_capacity(mdev, 0); diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index be3374b..c159692 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -57,6 +57,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) unsigned long db, dt, dbdt, rt, rs_left; unsigned int res; int i, x, y; + int stalled = 0; drbd_get_syncer_progress(mdev, &rs_left, &res); @@ -90,18 +91,17 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) * db: blocks written from mark until now * rt: remaining time */ - dt = (jiffies - mdev->rs_mark_time) / HZ; - - if (dt > 20) { - /* if we made no update to rs_mark_time for too long, - * we are stalled. show that. */ - seq_printf(seq, "stalled\n"); - return; - } + /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is + * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at + * least DRBD_SYNC_MARK_STEP time before it will be modified. */ + i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS; + dt = (jiffies - mdev->rs_mark_time[i]) / HZ; + if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS)) + stalled = 1; if (!dt) dt++; - db = mdev->rs_mark_left - rs_left; + db = mdev->rs_mark_left[i] - rs_left; rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ seq_printf(seq, "finish: %lu:%02lu:%02lu", @@ -128,7 +128,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) else seq_printf(seq, " (%ld)", dbdt); - seq_printf(seq, " K/sec\n"); + if (mdev->state.conn == C_SYNC_TARGET) { + if (mdev->c_sync_rate > 1000) + seq_printf(seq, " want: %d,%03d", + mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000); + else + seq_printf(seq, " want: %d", mdev->c_sync_rate); + } + seq_printf(seq, " K/sec%s\n", stalled ? 
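The progress code above reads the mark two slots behind the newest one, so the window it measures is at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old and will not be overwritten while it is being read. A tiny stand-alone version of the rolling-marks arithmetic (seconds instead of jiffies, names illustrative):

#include <stdio.h>

#define MARKS     8
#define MARK_STEP 3	/* seconds between marks (3*HZ in the driver) */

static unsigned long mark_left[MARKS];	/* blocks still out of sync at mark */
static unsigned long mark_time[MARKS];	/* seconds when the mark was taken */
static int last_mark;

static void take_mark(unsigned long now, unsigned long left)
{
	int next = (last_mark + 1) % MARKS;

	mark_time[next] = now;
	mark_left[next] = left;
	last_mark = next;
}

/* rate in blocks/s and ETA in s, measured against a mark that is
 * at least (MARKS-2)*MARK_STEP old and will not be modified soon */
static void progress(unsigned long now, unsigned long left,
		     unsigned long *rate, unsigned long *eta)
{
	int i = (last_mark + 2) % MARKS;
	unsigned long dt = now - mark_time[i];
	unsigned long db = mark_left[i] - left;

	if (!dt)
		dt = 1;
	*rate = db / dt;
	*eta  = *rate ? left / *rate : 0;
}

int main(void)
{
	unsigned long rate, eta, left = 100000;

	for (unsigned long t = 0; t <= 30; t += MARK_STEP, left -= 3000)
		take_mark(t, left);
	progress(31, left, &rate, &eta);
	printf("rate %lu blocks/s, eta %lu s\n", rate, eta);
	return 0;
}

The same window also drives the "(stalled)" tag in the driver: when the chosen mark is older than DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS, the progress line is still printed but flagged as stalled instead of being dropped.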
" (stalled)" : ""); } static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 48452fe..53b7425 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1481,13 +1481,19 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) r = SS_UNKNOWN_ERROR; if (r == SS_SUCCESS) { - mdev->rs_total = - mdev->rs_mark_left = drbd_bm_total_weight(mdev); + unsigned long tw = drbd_bm_total_weight(mdev); + unsigned long now = jiffies; + int i; + mdev->rs_failed = 0; mdev->rs_paused = 0; - mdev->rs_start = - mdev->rs_mark_time = jiffies; mdev->rs_same_csum = 0; + mdev->rs_total = tw; + mdev->rs_start = now; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + mdev->rs_mark_left[i] = tw; + mdev->rs_mark_time[i] = now; + } _drbd_pause_after(mdev); } write_unlock_irq(&global_state_lock); -- cgit v0.10.2 From 80a40e439e5a3f30b0a6210a1add6d7c33392e54 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 11 Aug 2010 23:28:00 +0200 Subject: drbd: reduce code duplication when receiving data requests also canonicalize the return values of read_for_csum and drbd_rs_begin_io to return -ESOMETHING, or 0 for success. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index b895470..ac04ef9 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -1119,7 +1119,7 @@ static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) * @mdev: DRBD device. * @sector: The sector number. * - * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted. + * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted. */ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) { @@ -1130,10 +1130,10 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) sig = wait_event_interruptible(mdev->al_wait, (bm_ext = _bme_get(mdev, enr))); if (sig) - return 0; + return -EINTR; if (test_bit(BME_LOCKED, &bm_ext->flags)) - return 1; + return 0; for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { sig = wait_event_interruptible(mdev->al_wait, @@ -1146,13 +1146,11 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) wake_up(&mdev->al_wait); } spin_unlock_irq(&mdev->al_lock); - return 0; + return -EINTR; } } - set_bit(BME_LOCKED, &bm_ext->flags); - - return 1; + return 0; } /** diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 2f9320b..346aed9 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2068,21 +2068,12 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) case P_DATA_REQUEST: e->w.cb = w_e_end_data_req; fault_type = DRBD_FAULT_DT_RD; - break; + /* application IO, don't drbd_rs_begin_io */ + goto submit; + case P_RS_DATA_REQUEST: e->w.cb = w_e_end_rsdata_req; fault_type = DRBD_FAULT_RS_RD; - /* Eventually this should become asynchronously. Currently it - * blocks the whole receiver just to delay the reading of a - * resync data block. - * the drbd_work_queue mechanism is made for this... - */ - if (!drbd_rs_begin_io(mdev, sector)) { - /* we have been interrupted, - * probably connection lost! 
*/ - D_ASSERT(signal_pending(current)); - goto out_free_e; - } break; case P_OV_REPLY: @@ -2108,13 +2099,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) } else if (h->command == P_OV_REPLY) { e->w.cb = w_e_end_ov_reply; dec_rs_pending(mdev); - break; - } - - if (!drbd_rs_begin_io(mdev, sector)) { - /* we have been interrupted, probably connection lost! */ - D_ASSERT(signal_pending(current)); - goto out_free_e; + /* drbd_rs_begin_io done when we sent this request */ + goto submit; } break; @@ -2133,31 +2119,23 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) } e->w.cb = w_e_end_ov_req; fault_type = DRBD_FAULT_RS_RD; - /* Eventually this should become asynchronous. Currently it - * blocks the whole receiver just to delay the reading of a - * resync data block. - * the drbd_work_queue mechanism is made for this... - */ - if (!drbd_rs_begin_io(mdev, sector)) { - /* we have been interrupted, - * probably connection lost! */ - D_ASSERT(signal_pending(current)); - goto out_free_e; - } break; - default: dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", cmdname(h->command)); fault_type = DRBD_FAULT_MAX; + goto out_free_e; } - spin_lock_irq(&mdev->req_lock); - list_add(&e->w.list, &mdev->read_ee); - spin_unlock_irq(&mdev->req_lock); + if (drbd_rs_begin_io(mdev, e->sector)) + goto out_free_e; +submit: inc_unacked(mdev); + spin_lock_irq(&mdev->req_lock); + list_add_tail(&e->w.list, &mdev->read_ee); + spin_unlock_irq(&mdev->req_lock); if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) return TRUE; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 53b7425..f5d779b 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -374,26 +374,26 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) struct drbd_epoch_entry *e; if (!get_ldev(mdev)) - return 0; + return -EIO; /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); if (!e) - goto fail; + goto defer; + e->w.cb = w_e_send_csum; spin_lock_irq(&mdev->req_lock); list_add(&e->w.list, &mdev->read_ee); spin_unlock_irq(&mdev->req_lock); - e->w.cb = w_e_send_csum; if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) - return 1; + return 0; drbd_free_ee(mdev, e); -fail: +defer: put_ldev(mdev); - return 2; + return -EAGAIN; } void resync_timer_fn(unsigned long data) @@ -649,15 +649,19 @@ next_sector: size = (capacity-sector)<<9; if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { switch (read_for_csum(mdev, sector, size)) { - case 0: /* Disk failure*/ + case -EIO: /* Disk failure */ put_ldev(mdev); return 0; - case 2: /* Allocation failed */ + case -EAGAIN: /* allocation failed, or ldev busy */ drbd_rs_complete_io(mdev, sector); mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); i = rollback_i; goto requeue; - /* case 1: everything ok */ + case 0: + /* everything ok */ + break; + default: + BUG(); } } else { inc_rs_pending(mdev); -- cgit v0.10.2 From 0f0601f4ea2f53cfd8bcae060fb03d9bbde070ec Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 11 Aug 2010 23:40:24 +0200 Subject: drbd: new configuration parameter c-min-rate We now track the data rate of locally submitted resync related requests, and can thus detect non-resync activity on the lower level device. 
If the current sync rate is above c-min-rate, and the lower level device appears to be busy, we throttle the resyncer. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 0fce3f3..0fedcc0 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1513,6 +1513,7 @@ extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); /* drbd_receiver.c */ +extern int drbd_rs_should_slow_down(struct drbd_conf *mdev); extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, const unsigned rw, const int fault_type); extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 1ff8418a..db93eee 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1098,6 +1098,8 @@ int __drbd_set_state(struct drbd_conf *mdev, mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(mdev->ov_position); mdev->rs_start = now; + mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; mdev->ov_last_oos_size = 0; mdev->ov_last_oos_start = 0; @@ -2706,7 +2708,8 @@ static void drbd_set_defaults(struct drbd_conf *mdev) /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF, /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF, /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF, - /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF + /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF, + /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF }; /* Have to use that way, because the layout differs between @@ -2742,6 +2745,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->packet_seq, 0); atomic_set(&mdev->pp_in_use, 0); atomic_set(&mdev->rs_sect_in, 0); + atomic_set(&mdev->rs_sect_ev, 0); mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); @@ -2819,6 +2823,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) mdev->rs_total = mdev->rs_failed = 0; mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; for (i = 0; i < DRBD_SYNC_MARKS; i++) { mdev->rs_mark_left[i] = 0; mdev->rs_mark_time[i] = 0; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 295b8d5..6b35d41 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1604,7 +1604,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF; sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF; sc.c_fill_target = DRBD_C_FILL_TARGET_DEF; - sc.c_max_rate = DRBD_C_MAX_RATE_DEF; + sc.c_max_rate = DRBD_C_MAX_RATE_DEF; + sc.c_min_rate = DRBD_C_MIN_RATE_DEF; } else memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 346aed9..0d9967fef 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1561,6 +1561,7 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si list_add(&e->w.list, &mdev->sync_ee); spin_unlock_irq(&mdev->req_lock); + atomic_add(data_size >> 9, &mdev->rs_sect_ev); if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) return TRUE; @@ -2017,17 +2018,66 @@ out_interrupted: return FALSE; } +/* We may throttle resync, if the lower device seems to be busy, + * and current sync rate is above c_min_rate. 
+ * + * To decide whether or not the lower device is busy, we use a scheme similar + * to MD RAID is_mddev_idle(): if the partition stats reveal "significant" + * (more than 64 sectors) of activity we cannot account for with our own resync + * activity, it obviously is "busy". + * + * The current sync rate used here uses only the most recent two step marks, + * to have a short time average so we can react faster. + */ +int drbd_rs_should_slow_down(struct drbd_conf *mdev) +{ + struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk; + unsigned long db, dt, dbdt; + int curr_events; + int throttle = 0; + + /* feature disabled? */ + if (mdev->sync_conf.c_min_rate == 0) + return 0; + + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&mdev->rs_sect_ev); + if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) { + unsigned long rs_left; + int i; + + mdev->rs_last_events = curr_events; + + /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP, + * approx. */ + i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS; + rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; + + dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ; + if (!dt) + dt++; + db = mdev->rs_mark_left[i] - rs_left; + dbdt = Bit2KB(db/dt); + + if (dbdt > mdev->sync_conf.c_min_rate) + throttle = 1; + } + return throttle; +} + + static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) { sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); struct drbd_epoch_entry *e; struct digest_info *di = NULL; + struct p_block_req *p = (struct p_block_req *)h; + const int brps = sizeof(*p)-sizeof(*h); int size, digest_size; unsigned int fault_type; - struct p_block_req *p = - (struct p_block_req *)h; - const int brps = sizeof(*p)-sizeof(*h); + if (drbd_recv(mdev, h->payload, brps) != brps) return FALSE; @@ -2099,8 +2149,9 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) } else if (h->command == P_OV_REPLY) { e->w.cb = w_e_end_ov_reply; dec_rs_pending(mdev); - /* drbd_rs_begin_io done when we sent this request */ - goto submit; + /* drbd_rs_begin_io done when we sent this request, + * but accounting still needs to be done. */ + goto submit_for_resync; } break; @@ -2128,9 +2179,36 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) goto out_free_e; } + /* Throttle, drbd_rs_begin_io and submit should become asynchronous + * wrt the receiver, but it is not as straightforward as it may seem. + * Various places in the resync start and stop logic assume resync + * requests are processed in order, requeuing this on the worker thread + * introduces a bunch of new code for synchronization between threads. + * + * Unlimited throttling before drbd_rs_begin_io may stall the resync + * "forever", throttling after drbd_rs_begin_io will lock that extent + * for application writes for the same time. For now, just throttle + * here, where the rest of the code expects the receiver to sleep for + * a while, anyways. + */ + + /* Throttle before drbd_rs_begin_io, as that locks out application IO; + * this defers syncer requests for some time, before letting at least + * on request through. The resync controller on the receiving side + * will adapt to the incoming rate accordingly. + * + * We cannot throttle here if remote is Primary/SyncTarget: + * we would also throttle its application reads. 
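drbd_rs_should_slow_down() above treats the lower device as busy when it moved noticeably more sectors than the resync itself submitted, and throttles only while the short-window resync rate is above c-min-rate. A stand-alone sketch of that heuristic with plain counters in place of part_stat_read() (thresholds and names illustrative, not the driver's):

#include <stdio.h>

#define UNACCOUNTED_THRESHOLD 64	/* sectors of foreign IO that count as "busy" */

static int rs_last_events;	/* disk sectors seen at the previous check */

/* disk_sectors: total sectors the lower device read+wrote so far
 * rs_sect_ev:   resync sectors we submitted to it ourselves
 * cur_rate_kb:  resync rate over the last two marks, in kByte/s
 * c_min_rate:   configured floor; 0 disables the throttle */
static int should_slow_down(int disk_sectors, int rs_sect_ev,
			    int cur_rate_kb, int c_min_rate)
{
	int curr_events = disk_sectors - rs_sect_ev;

	if (c_min_rate == 0)
		return 0;				/* feature disabled */

	if (curr_events - rs_last_events > UNACCOUNTED_THRESHOLD) {
		rs_last_events = curr_events;
		/* busy, and still syncing faster than the floor: back off */
		return cur_rate_kb > c_min_rate;
	}
	return 0;
}

int main(void)
{
	/* 5000 sectors moved, 4000 were our own resync IO,
	 * resync running at 8000 kB/s with a 4096 kB/s floor -> throttle */
	printf("%d\n", should_slow_down(5000, 4000, 8000, 4096));
	return 0;
}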
+ * In that case, throttling is done on the SyncTarget only. + */ + if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev)) + msleep(100); if (drbd_rs_begin_io(mdev, e->sector)) goto out_free_e; +submit_for_resync: + atomic_add(size >> 9, &mdev->rs_sect_ev); + submit: inc_unacked(mdev); spin_lock_irq(&mdev->req_lock); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index f5d779b..99c937a 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -215,10 +215,8 @@ void drbd_endio_sec(struct bio *bio, int error) */ void drbd_endio_pri(struct bio *bio, int error) { - unsigned long flags; struct drbd_request *req = bio->bi_private; struct drbd_conf *mdev = req->mdev; - struct bio_and_error m; enum drbd_req_event what; int uptodate = bio_flagged(bio, BIO_UPTODATE); @@ -244,12 +242,7 @@ void drbd_endio_pri(struct bio *bio, int error) bio_put(req->private_bio); req->private_bio = ERR_PTR(error); - spin_lock_irqsave(&mdev->req_lock, flags); - __req_mod(req, what, &m); - spin_unlock_irqrestore(&mdev->req_lock, flags); - - if (m.bio) - complete_master_bio(mdev, &m); + req_mod(req, what); } int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) @@ -376,6 +369,9 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) if (!get_ldev(mdev)) return -EIO; + if (drbd_rs_should_slow_down(mdev)) + goto defer; + /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); @@ -387,6 +383,7 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) list_add(&e->w.list, &mdev->read_ee); spin_unlock_irq(&mdev->req_lock); + atomic_add(size >> 9, &mdev->rs_sect_ev); if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) return 0; @@ -512,8 +509,9 @@ int w_make_resync_request(struct drbd_conf *mdev, sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); int max_segment_size; - int number, i, rollback_i, size, pe, mx; + int number, rollback_i, size, pe, mx; int align, queued, sndbuf; + int i = 0; if (unlikely(cancel)) return 1; @@ -549,7 +547,14 @@ int w_make_resync_request(struct drbd_conf *mdev, mdev->c_sync_rate = mdev->sync_conf.rate; number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); } - pe = atomic_read(&mdev->rs_pending_cnt); + + /* Throttle resync on lower level disk activity, which may also be + * caused by application IO on Primary/SyncTarget. + * Keep this after the call to drbd_rs_controller, as that assumes + * to be called as precisely as possible every SLEEP_TIME, + * and would be confused otherwise. 
*/ + if (drbd_rs_should_slow_down(mdev)) + goto requeue; mutex_lock(&mdev->data.mutex); if (mdev->data.socket) @@ -563,6 +568,7 @@ int w_make_resync_request(struct drbd_conf *mdev, mx = number; /* Limit the number of pending RS requests to no more than the peer's receive buffer */ + pe = atomic_read(&mdev->rs_pending_cnt); if ((pe + number) > mx) { number = mx - pe; } @@ -1492,6 +1498,8 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) mdev->rs_failed = 0; mdev->rs_paused = 0; mdev->rs_same_csum = 0; + mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; mdev->rs_total = tw; mdev->rs_start = now; for (i = 0; i < DRBD_SYNC_MARKS; i++) { @@ -1516,6 +1524,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) } atomic_set(&mdev->rs_sect_in, 0); + atomic_set(&mdev->rs_sect_ev, 0); mdev->rs_in_flight = 0; mdev->rs_planed = 0; spin_lock(&mdev->peer_seq_lock); diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 06dbba4..0b24ded 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -150,5 +150,9 @@ #define DRBD_C_MAX_RATE_MAX (4 << 20) #define DRBD_C_MAX_RATE_DEF 102400 +#define DRBD_C_MIN_RATE_MIN 0 /* kByte/sec */ +#define DRBD_C_MIN_RATE_MAX (4 << 20) +#define DRBD_C_MIN_RATE_DEF 4096 + #undef RANGE #endif diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index e23683c..ade9110 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -92,6 +92,7 @@ NL_PACKET(syncer_conf, 8, NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target) NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target) NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate) + NL_INTEGER( 80, T_MAY_IGNORE, c_min_rate) ) NL_PACKET(invalidate, 9, ) -- cgit v0.10.2 From 48acf8689847c061bd82c808c379f1bd79dfe407 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 23 Aug 2010 15:51:56 +0200 Subject: drbd: Microfix: Assigning sector once is sufficient Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 0d9967fef..591a171 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -365,7 +365,6 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, e->size = data_size; e->flags = 0; e->sector = sector; - e->sector = sector; e->block_id = id; return e; -- cgit v0.10.2 From 204bba9965c4cc175bf5bc65ddd19889e9085c72 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 23 Aug 2010 16:17:13 +0200 Subject: drbd: Bugfix for regression introduced with f9bc8913c06022e If we intent to use the block_id member of an epoch entry, we may not use the digest member. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 99c937a..1eeb554 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1052,7 +1052,9 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); } else { inc_rs_pending(mdev); - e->block_id = ID_SYNCER; + e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! 
*/ + e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */ + kfree(di); ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); } } else { -- cgit v0.10.2 From 0b70a13dac014ec9274640b9e945bde493ba365e Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 20 Aug 2010 13:36:10 +0200 Subject: drbd: Sending of big packets, for payloads from 64KByte to 4GByte Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 0fedcc0..3f10efc 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -337,13 +337,25 @@ static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) * NOTE that the payload starts at a long aligned offset, * regardless of 32 or 64 bit arch! */ -struct p_header { +struct p_header80 { u32 magic; u16 command; u16 length; /* bytes of data after this header */ u8 payload[0]; } __packed; -/* 8 bytes. packet FIXED for the next century! */ + +/* Header for big packets, Used for data packets exceeding 64kB */ +struct p_header95 { + u16 magic; /* use DRBD_MAGIC_BIG here */ + u16 command; + u32 length; + u8 payload[0]; +} __packed; + +union p_header { + struct p_header80 h80; + struct p_header95 h95; +}; /* * short commands, packets without payload, plain p_header: @@ -367,7 +379,7 @@ struct p_header { #define DP_MAY_SET_IN_SYNC 4 struct p_data { - struct p_header head; + union p_header head; u64 sector; /* 64 bits sector number */ u64 block_id; /* to identify the request in protocol B&C */ u32 seq_num; @@ -383,7 +395,7 @@ struct p_data { * P_DATA_REQUEST, P_RS_DATA_REQUEST */ struct p_block_ack { - struct p_header head; + struct p_header80 head; u64 sector; u64 block_id; u32 blksize; @@ -392,7 +404,7 @@ struct p_block_ack { struct p_block_req { - struct p_header head; + struct p_header80 head; u64 sector; u64 block_id; u32 blksize; @@ -409,7 +421,7 @@ struct p_block_req { */ struct p_handshake { - struct p_header head; /* 8 bytes */ + struct p_header80 head; /* 8 bytes */ u32 protocol_min; u32 feature_flags; u32 protocol_max; @@ -424,19 +436,19 @@ struct p_handshake { /* 80 bytes, FIXED for the next century */ struct p_barrier { - struct p_header head; + struct p_header80 head; u32 barrier; /* barrier number _handle_ only */ u32 pad; /* to multiple of 8 Byte */ } __packed; struct p_barrier_ack { - struct p_header head; + struct p_header80 head; u32 barrier; u32 set_size; } __packed; struct p_rs_param { - struct p_header head; + struct p_header80 head; u32 rate; /* Since protocol version 88 and higher. 
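The union above keeps the classic 8-byte header, whose 16-bit length limits a payload to just under 64 KiB, next to the 95-style header whose 32-bit length admits data packets up to 4 GiB; the two are told apart by their magic. A small sketch of how a sender might pick the variant by payload size (placeholder magic values, byte-order conversion omitted, names illustrative):

#include <stdint.h>
#include <stdio.h>

#define MAGIC_80  0x12345678u	/* placeholder for the classic DRBD magic */
#define MAGIC_BIG 0x835au	/* placeholder for the 16-bit "big" magic */

struct p_header80 {		/* 8 bytes, 16-bit length */
	uint32_t magic;
	uint16_t command;
	uint16_t length;
} __attribute__((packed));

struct p_header95 {		/* 8 bytes, 32-bit length */
	uint16_t magic;
	uint16_t command;
	uint32_t length;
} __attribute__((packed));

union p_header {
	struct p_header80 h80;
	struct p_header95 h95;
};

/* Fill whichever header variant the payload size requires. */
static void prepare_header(union p_header *h, uint16_t cmd, uint32_t size)
{
	if (size < (1u << 16)) {	/* fits the classic 16-bit length */
		h->h80.magic   = MAGIC_80;
		h->h80.command = cmd;
		h->h80.length  = (uint16_t)size;
	} else {			/* 64 KiB .. 4 GiB: big header */
		h->h95.magic   = MAGIC_BIG;
		h->h95.command = cmd;
		h->h95.length  = size;
	}
}

int main(void)
{
	union p_header h;

	prepare_header(&h, 0, 128 * 1024);
	printf("big header used: %s\n",
	       h.h95.magic == MAGIC_BIG ? "yes" : "no");
	return 0;
}

On the wire the driver additionally converts all header fields to big endian, as the _drbd_send_cmd() hunk below shows with cpu_to_be16().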
*/ @@ -444,7 +456,7 @@ struct p_rs_param { } __packed; struct p_rs_param_89 { - struct p_header head; + struct p_header80 head; u32 rate; /* protocol version 89: */ char verify_alg[SHARED_SECRET_MAX]; @@ -452,7 +464,7 @@ struct p_rs_param_89 { } __packed; struct p_rs_param_95 { - struct p_header head; + struct p_header80 head; u32 rate; char verify_alg[SHARED_SECRET_MAX]; char csums_alg[SHARED_SECRET_MAX]; @@ -468,7 +480,7 @@ enum drbd_conn_flags { }; struct p_protocol { - struct p_header head; + struct p_header80 head; u32 protocol; u32 after_sb_0p; u32 after_sb_1p; @@ -482,17 +494,17 @@ struct p_protocol { } __packed; struct p_uuids { - struct p_header head; + struct p_header80 head; u64 uuid[UI_EXTENDED_SIZE]; } __packed; struct p_rs_uuid { - struct p_header head; + struct p_header80 head; u64 uuid; } __packed; struct p_sizes { - struct p_header head; + struct p_header80 head; u64 d_size; /* size of disk */ u64 u_size; /* user requested size */ u64 c_size; /* current exported size */ @@ -502,18 +514,18 @@ struct p_sizes { } __packed; struct p_state { - struct p_header head; + struct p_header80 head; u32 state; } __packed; struct p_req_state { - struct p_header head; + struct p_header80 head; u32 mask; u32 val; } __packed; struct p_req_state_reply { - struct p_header head; + struct p_header80 head; u32 retcode; } __packed; @@ -528,7 +540,7 @@ struct p_drbd06_param { } __packed; struct p_discard { - struct p_header head; + struct p_header80 head; u64 block_id; u32 seq_num; u32 pad; @@ -544,7 +556,7 @@ enum drbd_bitmap_code { }; struct p_compressed_bm { - struct p_header head; + struct p_header80 head; /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code * (encoding & 0x80): polarity (set/unset) of first runlength * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits @@ -555,10 +567,10 @@ struct p_compressed_bm { u8 code[0]; } __packed; -struct p_delay_probe { - struct p_header head; - u32 seq_num; /* sequence number to match the two probe packets */ - u32 offset; /* usecs the probe got sent after the reference time point */ +struct p_delay_probe93 { + struct p_header80 head; + u32 seq_num; /* sequence number to match the two probe packets */ + u32 offset; /* usecs the probe got sent after the reference time point */ } __packed; /* DCBP: Drbd Compressed Bitmap Packet ... */ @@ -605,7 +617,7 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n) * so we need to use the fixed size 4KiB page size * most architechtures have used for a long time. 
*/ -#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) +#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80)) #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) #if (PAGE_SIZE < 4096) @@ -614,7 +626,7 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n) #endif union p_polymorph { - struct p_header header; + struct p_header80 header; struct p_handshake handshake; struct p_data data; struct p_block_ack block_ack; @@ -1188,12 +1200,12 @@ extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_f extern int _drbd_send_state(struct drbd_conf *mdev); extern int drbd_send_state(struct drbd_conf *mdev); extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, - enum drbd_packets cmd, struct p_header *h, + enum drbd_packets cmd, struct p_header80 *h, size_t size, unsigned msg_flags); #define USE_DATA_SOCKET 1 #define USE_META_SOCKET 0 extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, - enum drbd_packets cmd, struct p_header *h, + enum drbd_packets cmd, struct p_header80 *h, size_t size); extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, size_t size); @@ -1936,19 +1948,19 @@ static inline void request_ping(struct drbd_conf *mdev) static inline int drbd_send_short_cmd(struct drbd_conf *mdev, enum drbd_packets cmd) { - struct p_header h; + struct p_header80 h; return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); } static inline int drbd_send_ping(struct drbd_conf *mdev) { - struct p_header h; + struct p_header80 h; return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); } static inline int drbd_send_ping_ack(struct drbd_conf *mdev) { - struct p_header h; + struct p_header80 h; return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index db93eee..f3f4ea9 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1647,7 +1647,7 @@ void drbd_thread_current_set_cpu(struct drbd_conf *mdev) /* the appropriate socket mutex must be held already */ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, - enum drbd_packets cmd, struct p_header *h, + enum drbd_packets cmd, struct p_header80 *h, size_t size, unsigned msg_flags) { int sent, ok; @@ -1657,7 +1657,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, h->magic = BE_DRBD_MAGIC; h->command = cpu_to_be16(cmd); - h->length = cpu_to_be16(size-sizeof(struct p_header)); + h->length = cpu_to_be16(size-sizeof(struct p_header80)); sent = drbd_send(mdev, sock, h, size, msg_flags); @@ -1672,7 +1672,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, * when we hold the appropriate socket mutex. 
*/ int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, - enum drbd_packets cmd, struct p_header *h, size_t size) + enum drbd_packets cmd, struct p_header80 *h, size_t size) { int ok = 0; struct socket *sock; @@ -1700,7 +1700,7 @@ int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, size_t size) { - struct p_header h; + struct p_header80 h; int ok; h.magic = BE_DRBD_MAGIC; @@ -1807,7 +1807,7 @@ int drbd_send_protocol(struct drbd_conf *mdev) strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, - (struct p_header *)p, size); + (struct p_header80 *)p, size); kfree(p); return rv; } @@ -1833,7 +1833,7 @@ int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) put_ldev(mdev); return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); } int drbd_send_uuids(struct drbd_conf *mdev) @@ -1854,7 +1854,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) p.uuid = cpu_to_be64(val); return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); } int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) @@ -1884,7 +1884,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl p.dds_flags = cpu_to_be16(flags); ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); return ok; } @@ -1909,7 +1909,7 @@ int drbd_send_state(struct drbd_conf *mdev) if (likely(sock != NULL)) { ok = _drbd_send_cmd(mdev, sock, P_STATE, - (struct p_header *)&p, sizeof(p), 0); + (struct p_header80 *)&p, sizeof(p), 0); } mutex_unlock(&mdev->data.mutex); @@ -1927,7 +1927,7 @@ int drbd_send_state_req(struct drbd_conf *mdev, p.val = cpu_to_be32(val.i); return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); } int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) @@ -1937,7 +1937,7 @@ int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) p.retcode = cpu_to_be32(retcode); return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); } int fill_bitmap_rle_bits(struct drbd_conf *mdev, @@ -2036,7 +2036,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, enum { OK, FAILED, DONE } send_bitmap_rle_or_plain(struct drbd_conf *mdev, - struct p_header *h, struct bm_xfer_ctx *c) + struct p_header80 *h, struct bm_xfer_ctx *c) { struct p_compressed_bm *p = (void*)h; unsigned long num_words; @@ -2066,12 +2066,12 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev, if (len) drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, - h, sizeof(struct p_header) + len, 0); + h, sizeof(struct p_header80) + len, 0); c->word_offset += num_words; c->bit_offset = c->word_offset * BITS_PER_LONG; c->packets[1]++; - c->bytes[1] += sizeof(struct p_header) + len; + c->bytes[1] += sizeof(struct p_header80) + len; if (c->bit_offset > c->bm_bits) c->bit_offset = c->bm_bits; @@ -2087,14 +2087,14 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev, int _drbd_send_bitmap(struct drbd_conf *mdev) { struct bm_xfer_ctx c; - struct p_header *p; + struct p_header80 *p; int ret; ERR_IF(!mdev->bitmap) return 
FALSE; /* maybe we should use some per thread scratch page, * and allocate that during initial device creation? */ - p = (struct p_header *) __get_free_page(GFP_NOIO); + p = (struct p_header80 *) __get_free_page(GFP_NOIO); if (!p) { dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); return FALSE; @@ -2152,7 +2152,7 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) if (mdev->state.conn < C_CONNECTED) return FALSE; ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); return ok; } @@ -2180,7 +2180,7 @@ static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) return FALSE; ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); return ok; } @@ -2188,8 +2188,8 @@ int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, struct p_data *dp) { const int header_size = sizeof(struct p_data) - - sizeof(struct p_header); - int data_size = ((struct p_header *)dp)->length - header_size; + - sizeof(struct p_header80); + int data_size = ((struct p_header80 *)dp)->length - header_size; return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), dp->block_id); @@ -2238,7 +2238,7 @@ int drbd_send_drequest(struct drbd_conf *mdev, int cmd, p.blksize = cpu_to_be32(size); ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); return ok; } @@ -2256,7 +2256,7 @@ int drbd_send_drequest_csum(struct drbd_conf *mdev, p.head.magic = BE_DRBD_MAGIC; p.head.command = cpu_to_be16(cmd); - p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size); + p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size); mutex_lock(&mdev->data.mutex); @@ -2278,7 +2278,7 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) p.blksize = cpu_to_be32(size); ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, - (struct p_header *)&p, sizeof(p)); + (struct p_header80 *)&p, sizeof(p)); return ok; } @@ -2447,10 +2447,17 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; - p.head.magic = BE_DRBD_MAGIC; - p.head.command = cpu_to_be16(P_DATA); - p.head.length = - cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size); + if (req->size <= (1 << 15)) { + p.head.h80.magic = BE_DRBD_MAGIC; + p.head.h80.command = cpu_to_be16(P_DATA); + p.head.h80.length = + cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size); + } else { + p.head.h95.magic = BE_DRBD_MAGIC_BIG; + p.head.h95.command = cpu_to_be16(P_DATA); + p.head.h95.length = + cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size); + } p.sector = cpu_to_be64(req->sector); p.block_id = (unsigned long)req; @@ -2511,10 +2518,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? 
crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; - p.head.magic = BE_DRBD_MAGIC; - p.head.command = cpu_to_be16(cmd); - p.head.length = - cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size); + if (e->size <= (1 << 15)) { + p.head.h80.magic = BE_DRBD_MAGIC; + p.head.h80.command = cpu_to_be16(cmd); + p.head.h80.length = + cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size); + } else { + p.head.h95.magic = BE_DRBD_MAGIC_BIG; + p.head.h95.command = cpu_to_be16(cmd); + p.head.h95.length = + cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size); + } p.sector = cpu_to_be64(e->sector); p.block_id = e->block_id; @@ -2527,8 +2541,7 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, if (!drbd_get_data_sock(mdev)) return 0; - ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, - sizeof(p), dgs ? MSG_MORE : 0); + ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0); if (ok && dgs) { dgb = mdev->int_dig_out; drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 591a171..9b3321e 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -720,14 +720,14 @@ out: static int drbd_send_fp(struct drbd_conf *mdev, struct socket *sock, enum drbd_packets cmd) { - struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; + struct p_header80 *h = (struct p_header80 *) &mdev->data.sbuf.header; return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); } static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) { - struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; + struct p_header80 *h = (struct p_header80 *) &mdev->data.sbuf.header; int rr; rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); @@ -944,7 +944,7 @@ out_release_sockets: return -1; } -static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h) +static int drbd_recv_header(struct drbd_conf *mdev, struct p_header80 *h) { int r; @@ -1266,7 +1266,7 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea return 1; } -static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) +static int receive_Barrier(struct drbd_conf *mdev, struct p_header80 *h) { int rv, issue_flush; struct p_barrier *p = (struct p_barrier *)h; @@ -1570,7 +1570,7 @@ fail: return FALSE; } -static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) +static int receive_DataReply(struct drbd_conf *mdev, struct p_header80 *h) { struct drbd_request *req; sector_t sector; @@ -1610,7 +1610,7 @@ static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) return ok; } -static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) +static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header80 *h) { sector_t sector; unsigned int header_size, data_size; @@ -1767,7 +1767,7 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) } /* mirrored write */ -static int receive_Data(struct drbd_conf *mdev, struct p_header *h) +static int receive_Data(struct drbd_conf *mdev, struct p_header80 *h) { sector_t sector; struct drbd_epoch_entry *e; @@ -2066,7 +2066,7 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev) } -static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) +static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h) { sector_t sector; const sector_t capacity = 
drbd_get_capacity(mdev->this_bdev); @@ -2756,7 +2756,7 @@ static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) return 1; } -static int receive_protocol(struct drbd_conf *mdev, struct p_header *h) +static int receive_protocol(struct drbd_conf *mdev, struct p_header80 *h) { struct p_protocol *p = (struct p_protocol *)h; int header_size, data_size; @@ -2862,7 +2862,7 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, return tfm; } -static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) +static int receive_SyncParam(struct drbd_conf *mdev, struct p_header80 *h) { int ok = TRUE; struct p_rs_param_95 *p = (struct p_rs_param_95 *)h; @@ -3032,7 +3032,7 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev, (unsigned long long)a, (unsigned long long)b); } -static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) +static int receive_sizes(struct drbd_conf *mdev, struct p_header80 *h) { struct p_sizes *p = (struct p_sizes *)h; enum determine_dev_size dd = unchanged; @@ -3148,7 +3148,7 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int receive_uuids(struct drbd_conf *mdev, struct p_header *h) +static int receive_uuids(struct drbd_conf *mdev, struct p_header80 *h) { struct p_uuids *p = (struct p_uuids *)h; u64 *p_uuid; @@ -3241,7 +3241,7 @@ static union drbd_state convert_state(union drbd_state ps) return ms; } -static int receive_req_state(struct drbd_conf *mdev, struct p_header *h) +static int receive_req_state(struct drbd_conf *mdev, struct p_header80 *h) { struct p_req_state *p = (struct p_req_state *)h; union drbd_state mask, val; @@ -3271,7 +3271,7 @@ static int receive_req_state(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int receive_state(struct drbd_conf *mdev, struct p_header *h) +static int receive_state(struct drbd_conf *mdev, struct p_header80 *h) { struct p_state *p = (struct p_state *)h; enum drbd_conns nconn, oconn; @@ -3395,7 +3395,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) +static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header80 *h) { struct p_rs_uuid *p = (struct p_rs_uuid *)h; @@ -3428,7 +3428,7 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) enum receive_bitmap_ret { OK, DONE, FAILED }; static enum receive_bitmap_ret -receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h, +receive_bitmap_plain(struct drbd_conf *mdev, struct p_header80 *h, unsigned long *buffer, struct bm_xfer_ctx *c) { unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); @@ -3533,7 +3533,7 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, const char *direction, struct bm_xfer_ctx *c) { /* what would it take to transfer it "plaintext" */ - unsigned plain = sizeof(struct p_header) * + unsigned plain = sizeof(struct p_header80) * ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) + c->bm_words * sizeof(long); unsigned total = c->bytes[0] + c->bytes[1]; @@ -3571,7 +3571,7 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, in order to be agnostic to the 32 vs 64 bits issue. returns 0 on failure, 1 if we successfully received it. 
*/ -static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) +static int receive_bitmap(struct drbd_conf *mdev, struct p_header80 *h) { struct bm_xfer_ctx c; void *buffer; @@ -3623,7 +3623,7 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) } c.packets[h->command == P_BITMAP]++; - c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length; + c.bytes[h->command == P_BITMAP] += sizeof(struct p_header80) + h->length; if (ret != OK) break; @@ -3659,7 +3659,7 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) return ok; } -static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent) +static int receive_skip_(struct drbd_conf *mdev, struct p_header80 *h, int silent) { /* TODO zero copy sink :) */ static char sink[128]; @@ -3679,17 +3679,17 @@ static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent) return size == 0; } -static int receive_skip(struct drbd_conf *mdev, struct p_header *h) +static int receive_skip(struct drbd_conf *mdev, struct p_header80 *h) { return receive_skip_(mdev, h, 0); } -static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h) +static int receive_skip_silent(struct drbd_conf *mdev, struct p_header80 *h) { return receive_skip_(mdev, h, 1); } -static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) +static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header80 *h) { if (mdev->state.disk >= D_INCONSISTENT) drbd_kick_lo(mdev); @@ -3701,7 +3701,7 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); +typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header80 *); static drbd_cmd_handler_f drbd_default_handler[] = { [P_DATA] = receive_Data, @@ -3736,7 +3736,7 @@ static drbd_cmd_handler_f *drbd_opt_cmd_handler; static void drbdd(struct drbd_conf *mdev) { drbd_cmd_handler_f handler; - struct p_header *header = &mdev->data.rbuf.header; + struct p_header80 *header = &mdev->data.rbuf.header; while (get_t_state(&mdev->receiver) == Running) { drbd_thread_current_set_cpu(mdev); @@ -3964,7 +3964,7 @@ static int drbd_send_handshake(struct drbd_conf *mdev) p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, - (struct p_header *)p, sizeof(*p), 0 ); + (struct p_header80 *)p, sizeof(*p), 0 ); mutex_unlock(&mdev->data.mutex); return ok; } @@ -3981,7 +3981,7 @@ static int drbd_do_handshake(struct drbd_conf *mdev) /* ASSERT current == mdev->receiver ... 
*/ struct p_handshake *p = &mdev->data.rbuf.handshake; const int expect = sizeof(struct p_handshake) - -sizeof(struct p_header); + -sizeof(struct p_header80); int rv; rv = drbd_send_handshake(mdev); @@ -4058,7 +4058,7 @@ static int drbd_do_auth(struct drbd_conf *mdev) char *response = NULL; char *right_response = NULL; char *peers_ch = NULL; - struct p_header p; + struct p_header80 p; unsigned int key_len = strlen(mdev->net_conf->shared_secret); unsigned int resp_size; struct hash_desc desc; @@ -4231,7 +4231,7 @@ int drbdd_init(struct drbd_thread *thi) /* ********* acknowledge sender ******** */ -static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) +static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h) { struct p_req_state_reply *p = (struct p_req_state_reply *)h; @@ -4249,13 +4249,13 @@ static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int got_Ping(struct drbd_conf *mdev, struct p_header *h) +static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h) { return drbd_send_ping_ack(mdev); } -static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) +static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h) { /* restore idle timeout */ mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; @@ -4265,7 +4265,7 @@ static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) +static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h) { struct p_block_ack *p = (struct p_block_ack *)h; sector_t sector = be64_to_cpu(p->sector); @@ -4336,7 +4336,7 @@ static int validate_req_change_req_state(struct drbd_conf *mdev, return TRUE; } -static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) +static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h) { struct p_block_ack *p = (struct p_block_ack *)h; sector_t sector = be64_to_cpu(p->sector); @@ -4376,7 +4376,7 @@ static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) _ack_id_to_req, __func__ , what); } -static int got_NegAck(struct drbd_conf *mdev, struct p_header *h) +static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h) { struct p_block_ack *p = (struct p_block_ack *)h; sector_t sector = be64_to_cpu(p->sector); @@ -4396,7 +4396,7 @@ static int got_NegAck(struct drbd_conf *mdev, struct p_header *h) _ack_id_to_req, __func__ , neg_acked); } -static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) +static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h) { struct p_block_ack *p = (struct p_block_ack *)h; sector_t sector = be64_to_cpu(p->sector); @@ -4409,7 +4409,7 @@ static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) _ar_id_to_req, __func__ , neg_acked); } -static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) +static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h) { sector_t sector; int size; @@ -4431,7 +4431,7 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) +static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) { struct p_barrier_ack *p = (struct p_barrier_ack *)h; @@ -4440,7 +4440,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) +static int got_OVResult(struct drbd_conf *mdev, struct 
p_header80 *h) { struct p_block_ack *p = (struct p_block_ack *)h; struct drbd_work *w; @@ -4474,7 +4474,7 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) return TRUE; } -static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h) +static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header80 *h) { /* IGNORE */ return TRUE; @@ -4482,7 +4482,7 @@ static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h) struct asender_cmd { size_t pkt_size; - int (*process)(struct drbd_conf *mdev, struct p_header *h); + int (*process)(struct drbd_conf *mdev, struct p_header80 *h); }; static struct asender_cmd *get_asender_cmd(int cmd) @@ -4491,8 +4491,8 @@ static struct asender_cmd *get_asender_cmd(int cmd) /* anything missing from this table is in * the drbd_cmd_handler (drbd_default_handler) table, * see the beginning of drbdd() */ - [P_PING] = { sizeof(struct p_header), got_Ping }, - [P_PING_ACK] = { sizeof(struct p_header), got_PingAck }, + [P_PING] = { sizeof(struct p_header80), got_Ping }, + [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck }, [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, @@ -4504,7 +4504,7 @@ static struct asender_cmd *get_asender_cmd(int cmd) [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, - [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_something_to_ignore_m }, + [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_something_to_ignore_m }, [P_MAX_CMD] = { 0, NULL }, }; if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) @@ -4515,13 +4515,13 @@ static struct asender_cmd *get_asender_cmd(int cmd) int drbd_asender(struct drbd_thread *thi) { struct drbd_conf *mdev = thi->mdev; - struct p_header *h = &mdev->meta.rbuf.header; + struct p_header80 *h = &mdev->meta.rbuf.header; struct asender_cmd *cmd = NULL; int rv, len; void *buf = h; int received = 0; - int expect = sizeof(struct p_header); + int expect = sizeof(struct p_header80); int empty; sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); @@ -4621,7 +4621,7 @@ int drbd_asender(struct drbd_thread *thi) goto disconnect; } expect = cmd->pkt_size; - ERR_IF(len != expect-sizeof(struct p_header)) + ERR_IF(len != expect-sizeof(struct p_header80)) goto reconnect; } if (received == expect) { @@ -4631,7 +4631,7 @@ int drbd_asender(struct drbd_thread *thi) buf = h; received = 0; - expect = sizeof(struct p_header); + expect = sizeof(struct p_header80); cmd = NULL; } } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 1eeb554..3d0e14e 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1204,7 +1204,7 @@ int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) * dec_ap_pending will be done in got_BarrierAck * or (on connection loss) in w_clear_epoch. 
*/ ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, - (struct p_header *)p, sizeof(*p), 0); + (struct p_header80 *)p, sizeof(*p), 0); drbd_put_data_sock(mdev); return ok; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 0b2bfb5..89718a3 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -318,6 +318,8 @@ enum drbd_timeout_flag { #define DRBD_MAGIC 0x83740267 #define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) +#define DRBD_MAGIC_BIG 0x835a +#define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG) /* these are of type "int" */ #define DRBD_MD_INDEX_INTERNAL -1 -- cgit v0.10.2 From 02918be2273a6b086292e0d85b740336eda46e36 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 20 Aug 2010 14:35:10 +0200 Subject: drbd: receiving of big packets, for payloads between 64kByte and 4GByte Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 3f10efc..db7e655 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -626,7 +626,7 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n) #endif union p_polymorph { - struct p_header80 header; + union p_header header; struct p_handshake handshake; struct p_data data; struct p_block_ack block_ack; @@ -641,6 +641,8 @@ union p_polymorph { struct p_req_state req_state; struct p_req_state_reply req_state_reply; struct p_block_req block_req; + struct p_delay_probe93 delay_probe93; + struct p_rs_uuid rs_uuid; } __packed; /**********************************************************************/ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 9b3321e..fe30864 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -720,14 +720,14 @@ out: static int drbd_send_fp(struct drbd_conf *mdev, struct socket *sock, enum drbd_packets cmd) { - struct p_header80 *h = (struct p_header80 *) &mdev->data.sbuf.header; + struct p_header80 *h = &mdev->data.sbuf.header.h80; return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); } static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) { - struct p_header80 *h = (struct p_header80 *) &mdev->data.sbuf.header; + struct p_header80 *h = &mdev->data.rbuf.header.h80; int rr; rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); @@ -944,22 +944,27 @@ out_release_sockets: return -1; } -static int drbd_recv_header(struct drbd_conf *mdev, struct p_header80 *h) +static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size) { + union p_header *h = &mdev->data.rbuf.header; int r; r = drbd_recv(mdev, h, sizeof(*h)); - if (unlikely(r != sizeof(*h))) { dev_err(DEV, "short read expecting header on sock: r=%d\n", r); return FALSE; - }; - h->command = be16_to_cpu(h->command); - h->length = be16_to_cpu(h->length); - if (unlikely(h->magic != BE_DRBD_MAGIC)) { + } + + if (likely(h->h80.magic == BE_DRBD_MAGIC)) { + *cmd = be16_to_cpu(h->h80.command); + *packet_size = be16_to_cpu(h->h80.length); + } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) { + *cmd = be16_to_cpu(h->h95.command); + *packet_size = be32_to_cpu(h->h95.length); + } else { dev_err(DEV, "magic?? 
on data m: 0x%lx c: %d l: %d\n", - (long)be32_to_cpu(h->magic), - h->command, h->length); + (long)be32_to_cpu(h->h80.magic), + h->h80.command, h->h80.length); return FALSE; } mdev->last_received = jiffies; @@ -1266,17 +1271,12 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea return 1; } -static int receive_Barrier(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { int rv, issue_flush; - struct p_barrier *p = (struct p_barrier *)h; + struct p_barrier *p = &mdev->data.rbuf.barrier; struct drbd_epoch *epoch; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - - rv = drbd_recv(mdev, h->payload, h->length); - ERR_IF(rv != h->length) return FALSE; - inc_unacked(mdev); if (mdev->net_conf->wire_protocol != DRBD_PROT_C) @@ -1570,21 +1570,12 @@ fail: return FALSE; } -static int receive_DataReply(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { struct drbd_request *req; sector_t sector; - unsigned int header_size, data_size; int ok; - struct p_data *p = (struct p_data *)h; - - header_size = sizeof(*p) - sizeof(*h); - data_size = h->length - header_size; - - ERR_IF(data_size == 0) return FALSE; - - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; + struct p_data *p = &mdev->data.rbuf.data; sector = be64_to_cpu(p->sector); @@ -1610,20 +1601,11 @@ static int receive_DataReply(struct drbd_conf *mdev, struct p_header80 *h) return ok; } -static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { sector_t sector; - unsigned int header_size, data_size; int ok; - struct p_data *p = (struct p_data *)h; - - header_size = sizeof(*p) - sizeof(*h); - data_size = h->length - header_size; - - ERR_IF(data_size == 0) return FALSE; - - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; + struct p_data *p = &mdev->data.rbuf.data; sector = be64_to_cpu(p->sector); D_ASSERT(p->block_id == ID_SYNCER); @@ -1767,23 +1749,14 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) } /* mirrored write */ -static int receive_Data(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { sector_t sector; struct drbd_epoch_entry *e; - struct p_data *p = (struct p_data *)h; - int header_size, data_size; + struct p_data *p = &mdev->data.rbuf.data; int rw = WRITE; u32 dp_flags; - header_size = sizeof(*p) - sizeof(*h); - data_size = h->length - header_size; - - ERR_IF(data_size == 0) return FALSE; - - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; - if (!get_ldev(mdev)) { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Can not write mirrored data block " @@ -2066,20 +2039,15 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev) } -static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size) { sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); struct drbd_epoch_entry *e; struct digest_info *di = NULL; - struct p_block_req *p = (struct p_block_req *)h; - const int brps = sizeof(*p)-sizeof(*h); - int size, digest_size; + int size; 
unsigned int fault_type; - - - if (drbd_recv(mdev, h->payload, brps) != brps) - return FALSE; + struct p_block_req *p = &mdev->data.rbuf.block_req; sector = be64_to_cpu(p->sector); size = be32_to_cpu(p->blksize); @@ -2099,9 +2067,9 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h) if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Can not satisfy peer's read request, " "no local data.\n"); - drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : + drbd_send_ack_rp(mdev, cmd == P_DATA_REQUEST ? P_NEG_DREPLY : P_NEG_RS_DREPLY , p); - return drbd_drain_block(mdev, h->length - brps); + return TRUE; } /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD @@ -2113,7 +2081,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h) return FALSE; } - switch (h->command) { + switch (cmd) { case P_DATA_REQUEST: e->w.cb = w_e_end_data_req; fault_type = DRBD_FAULT_DT_RD; @@ -2128,7 +2096,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h) case P_OV_REPLY: case P_CSUM_RS_REQUEST: fault_type = DRBD_FAULT_RS_RD; - digest_size = h->length - brps ; di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); if (!di) goto out_free_e; @@ -2142,10 +2109,10 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h) if (drbd_recv(mdev, di->digest, digest_size) != digest_size) goto out_free_e; - if (h->command == P_CSUM_RS_REQUEST) { + if (cmd == P_CSUM_RS_REQUEST) { D_ASSERT(mdev->agreed_pro_version >= 89); e->w.cb = w_e_end_csum_rs_req; - } else if (h->command == P_OV_REPLY) { + } else if (cmd == P_OV_REPLY) { e->w.cb = w_e_end_ov_reply; dec_rs_pending(mdev); /* drbd_rs_begin_io done when we sent this request, @@ -2173,7 +2140,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h) default: dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", - cmdname(h->command)); + cmdname(cmd)); fault_type = DRBD_FAULT_MAX; goto out_free_e; } @@ -2756,20 +2723,13 @@ static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) return 1; } -static int receive_protocol(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_protocol *p = (struct p_protocol *)h; - int header_size, data_size; + struct p_protocol *p = &mdev->data.rbuf.protocol; int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; int p_want_lose, p_two_primaries, cf; char p_integrity_alg[SHARED_SECRET_MAX] = ""; - header_size = sizeof(*p) - sizeof(*h); - data_size = h->length - header_size; - - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; - p_proto = be32_to_cpu(p->protocol); p_after_sb_0p = be32_to_cpu(p->after_sb_0p); p_after_sb_1p = be32_to_cpu(p->after_sb_1p); @@ -2862,10 +2822,10 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, return tfm; } -static int receive_SyncParam(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size) { int ok = TRUE; - struct p_rs_param_95 *p = (struct p_rs_param_95 *)h; + struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95; unsigned int header_size, data_size, exp_max_sz; struct crypto_hash *verify_tfm = NULL; struct crypto_hash *csums_tfm = NULL; @@ -2879,29 +2839,29 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header80 *h) : apv <= 94 ? 
sizeof(struct p_rs_param_89) : /* apv >= 95 */ sizeof(struct p_rs_param_95); - if (h->length > exp_max_sz) { + if (packet_size > exp_max_sz) { dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", - h->length, exp_max_sz); + packet_size, exp_max_sz); return FALSE; } if (apv <= 88) { - header_size = sizeof(struct p_rs_param) - sizeof(*h); - data_size = h->length - header_size; + header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80); + data_size = packet_size - header_size; } else if (apv <= 94) { - header_size = sizeof(struct p_rs_param_89) - sizeof(*h); - data_size = h->length - header_size; + header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80); + data_size = packet_size - header_size; D_ASSERT(data_size == 0); } else { - header_size = sizeof(struct p_rs_param_95) - sizeof(*h); - data_size = h->length - header_size; + header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80); + data_size = packet_size - header_size; D_ASSERT(data_size == 0); } /* initialize verify_alg and csums_alg */ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); - if (drbd_recv(mdev, h->payload, header_size) != header_size) + if (drbd_recv(mdev, &p->head.payload, header_size) != header_size) return FALSE; mdev->sync_conf.rate = be32_to_cpu(p->rate); @@ -3032,19 +2992,15 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev, (unsigned long long)a, (unsigned long long)b); } -static int receive_sizes(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_sizes *p = (struct p_sizes *)h; + struct p_sizes *p = &mdev->data.rbuf.sizes; enum determine_dev_size dd = unchanged; unsigned int max_seg_s; sector_t p_size, p_usize, my_usize; int ldsc = 0; /* local disk size changed */ enum dds_flags ddsf; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; - p_size = be64_to_cpu(p->d_size); p_usize = be64_to_cpu(p->u_size); @@ -3148,16 +3104,12 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header80 *h) return TRUE; } -static int receive_uuids(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_uuids *p = (struct p_uuids *)h; + struct p_uuids *p = &mdev->data.rbuf.uuids; u64 *p_uuid; int i; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; - p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) @@ -3241,16 +3193,12 @@ static union drbd_state convert_state(union drbd_state ps) return ms; } -static int receive_req_state(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_req_state *p = (struct p_req_state *)h; + struct p_req_state *p = &mdev->data.rbuf.req_state; union drbd_state mask, val; int rv; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; - mask.i = be32_to_cpu(p->mask); val.i = be32_to_cpu(p->val); @@ -3271,21 +3219,15 @@ static int receive_req_state(struct drbd_conf *mdev, struct p_header80 *h) return TRUE; } -static int receive_state(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_state(struct drbd_conf 
*mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_state *p = (struct p_state *)h; + struct p_state *p = &mdev->data.rbuf.state; enum drbd_conns nconn, oconn; union drbd_state ns, peer_state; enum drbd_disk_state real_peer_disk; enum chg_state_flags cs_flags; int rv; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) - return FALSE; - - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; - peer_state.i = be32_to_cpu(p->state); real_peer_disk = peer_state.disk; @@ -3395,9 +3337,9 @@ static int receive_state(struct drbd_conf *mdev, struct p_header80 *h) return TRUE; } -static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_rs_uuid *p = (struct p_rs_uuid *)h; + struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid; wait_event(mdev->misc_wait, mdev->state.conn == C_WF_SYNC_UUID || @@ -3406,10 +3348,6 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header80 *h) /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */ - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; - /* Here the _drbd_uuid_ functions are right, current should _not_ be rotated into the history */ if (get_ldev_if_state(mdev, D_NEGOTIATING)) { @@ -3428,14 +3366,14 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header80 *h) enum receive_bitmap_ret { OK, DONE, FAILED }; static enum receive_bitmap_ret -receive_bitmap_plain(struct drbd_conf *mdev, struct p_header80 *h, - unsigned long *buffer, struct bm_xfer_ctx *c) +receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, + unsigned long *buffer, struct bm_xfer_ctx *c) { unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); unsigned want = num_words * sizeof(long); - if (want != h->length) { - dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length); + if (want != data_size) { + dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size); return FAILED; } if (want == 0) @@ -3571,12 +3509,13 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, in order to be agnostic to the 32 vs 64 bits issue. returns 0 on failure, 1 if we successfully received it. */ -static int receive_bitmap(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { struct bm_xfer_ctx c; void *buffer; enum receive_bitmap_ret ret; int ok = FALSE; + struct p_header80 *h = &mdev->data.rbuf.header.h80; wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); @@ -3596,21 +3535,21 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header80 *h) }; do { - if (h->command == P_BITMAP) { - ret = receive_bitmap_plain(mdev, h, buffer, &c); - } else if (h->command == P_COMPRESSED_BITMAP) { + if (cmd == P_BITMAP) { + ret = receive_bitmap_plain(mdev, data_size, buffer, &c); + } else if (cmd == P_COMPRESSED_BITMAP) { /* MAYBE: sanity check that we speak proto >= 90, * and the feature is enabled! 
*/ struct p_compressed_bm *p; - if (h->length > BM_PACKET_PAYLOAD_BYTES) { + if (data_size > BM_PACKET_PAYLOAD_BYTES) { dev_err(DEV, "ReportCBitmap packet too large\n"); goto out; } /* use the page buff */ p = buffer; memcpy(p, h, sizeof(*h)); - if (drbd_recv(mdev, p->head.payload, h->length) != h->length) + if (drbd_recv(mdev, p->head.payload, data_size) != data_size) goto out; if (p->head.length <= (sizeof(*p) - sizeof(p->head))) { dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length); @@ -3618,17 +3557,17 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header80 *h) } ret = decode_bitmap_c(mdev, p, &c); } else { - dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command); + dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd); goto out; } - c.packets[h->command == P_BITMAP]++; - c.bytes[h->command == P_BITMAP] += sizeof(struct p_header80) + h->length; + c.packets[cmd == P_BITMAP]++; + c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size; if (ret != OK) break; - if (!drbd_recv_header(mdev, h)) + if (!drbd_recv_header(mdev, &cmd, &data_size)) goto out; } while (ret == OK); if (ret == FAILED) @@ -3659,17 +3598,16 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header80 *h) return ok; } -static int receive_skip_(struct drbd_conf *mdev, struct p_header80 *h, int silent) +static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { /* TODO zero copy sink :) */ static char sink[128]; int size, want, r; - if (!silent) - dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", - h->command, h->length); + dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", + cmd, data_size); - size = h->length; + size = data_size; while (size > 0) { want = min_t(int, size, sizeof(sink)); r = drbd_recv(mdev, sink, want); @@ -3679,17 +3617,7 @@ static int receive_skip_(struct drbd_conf *mdev, struct p_header80 *h, int silen return size == 0; } -static int receive_skip(struct drbd_conf *mdev, struct p_header80 *h) -{ - return receive_skip_(mdev, h, 0); -} - -static int receive_skip_silent(struct drbd_conf *mdev, struct p_header80 *h) -{ - return receive_skip_(mdev, h, 1); -} - -static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header80 *h) +static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { if (mdev->state.disk >= D_INCONSISTENT) drbd_kick_lo(mdev); @@ -3701,73 +3629,91 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header80 *h) return TRUE; } -typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header80 *); - -static drbd_cmd_handler_f drbd_default_handler[] = { - [P_DATA] = receive_Data, - [P_DATA_REPLY] = receive_DataReply, - [P_RS_DATA_REPLY] = receive_RSDataReply, - [P_BARRIER] = receive_Barrier, - [P_BITMAP] = receive_bitmap, - [P_COMPRESSED_BITMAP] = receive_bitmap, - [P_UNPLUG_REMOTE] = receive_UnplugRemote, - [P_DATA_REQUEST] = receive_DataRequest, - [P_RS_DATA_REQUEST] = receive_DataRequest, - [P_SYNC_PARAM] = receive_SyncParam, - [P_SYNC_PARAM89] = receive_SyncParam, - [P_PROTOCOL] = receive_protocol, - [P_UUIDS] = receive_uuids, - [P_SIZES] = receive_sizes, - [P_STATE] = receive_state, - [P_STATE_CHG_REQ] = receive_req_state, - [P_SYNC_UUID] = receive_sync_uuid, - [P_OV_REQUEST] = receive_DataRequest, - [P_OV_REPLY] = receive_DataRequest, - [P_CSUM_RS_REQUEST] = receive_DataRequest, - [P_DELAY_PROBE] = 
receive_skip_silent, +typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive); + +struct data_cmd { + int expect_payload; + size_t pkt_size; + drbd_cmd_handler_f function; +}; + +static struct data_cmd drbd_cmd_handler[] = { + [P_DATA] = { 1, sizeof(struct p_data), receive_Data }, + [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, + [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , + [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , + [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } , + [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } , + [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote }, + [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam }, + [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam }, + [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, + [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, + [P_STATE] = { 0, sizeof(struct p_state), receive_state }, + [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state }, + [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid }, + [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, /* anything missing from this table is in * the asender_tbl, see get_asender_cmd */ - [P_MAX_CMD] = NULL, + [P_MAX_CMD] = { 0, 0, NULL }, }; -static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; -static drbd_cmd_handler_f *drbd_opt_cmd_handler; +/* All handler functions that expect a sub-header get that sub-heder in + mdev->data.rbuf.header.head.payload. + + Usually in mdev->data.rbuf.header.head the callback can find the usual + p_header, but they may not rely on that. Since there is also p_header95 ! 
+ */ static void drbdd(struct drbd_conf *mdev) { - drbd_cmd_handler_f handler; - struct p_header80 *header = &mdev->data.rbuf.header; + union p_header *header = &mdev->data.rbuf.header; + unsigned int packet_size; + enum drbd_packets cmd; + size_t shs; /* sub header size */ + int rv; while (get_t_state(&mdev->receiver) == Running) { drbd_thread_current_set_cpu(mdev); - if (!drbd_recv_header(mdev, header)) { - drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); - break; + if (!drbd_recv_header(mdev, &cmd, &packet_size)) + goto err_out; + + if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) { + dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size); + goto err_out; } - if (header->command < P_MAX_CMD) - handler = drbd_cmd_handler[header->command]; - else if (P_MAY_IGNORE < header->command - && header->command < P_MAX_OPT_CMD) - handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE]; - else if (header->command > P_MAX_OPT_CMD) - handler = receive_skip; - else - handler = NULL; + shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header); + rv = drbd_recv(mdev, &header->h80.payload, shs); + if (unlikely(rv != shs)) { + dev_err(DEV, "short read while reading sub header: rv=%d\n", rv); + goto err_out; + } - if (unlikely(!handler)) { - dev_err(DEV, "unknown packet type %d, l: %d!\n", - header->command, header->length); - drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); - break; + if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) { + dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size); + goto err_out; } - if (unlikely(!handler(mdev, header))) { + + rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs); + + if (unlikely(!rv)) { dev_err(DEV, "error receiving %s, l: %d!\n", - cmdname(header->command), header->length); - drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); - break; + cmdname(cmd), packet_size); + goto err_out; } } + + if (0) { + err_out: + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); + } } void drbd_flush_workqueue(struct drbd_conf *mdev) @@ -3980,27 +3926,28 @@ static int drbd_do_handshake(struct drbd_conf *mdev) { /* ASSERT current == mdev->receiver ... 
*/ struct p_handshake *p = &mdev->data.rbuf.handshake; - const int expect = sizeof(struct p_handshake) - -sizeof(struct p_header80); + const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80); + unsigned int length; + enum drbd_packets cmd; int rv; rv = drbd_send_handshake(mdev); if (!rv) return 0; - rv = drbd_recv_header(mdev, &p->head); + rv = drbd_recv_header(mdev, &cmd, &length); if (!rv) return 0; - if (p->head.command != P_HAND_SHAKE) { + if (cmd != P_HAND_SHAKE) { dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", - cmdname(p->head.command), p->head.command); + cmdname(cmd), cmd); return -1; } - if (p->head.length != expect) { + if (length != expect) { dev_err(DEV, "expected HandShake length: %u, received: %u\n", - expect, p->head.length); + expect, length); return -1; } @@ -4058,10 +4005,11 @@ static int drbd_do_auth(struct drbd_conf *mdev) char *response = NULL; char *right_response = NULL; char *peers_ch = NULL; - struct p_header80 p; unsigned int key_len = strlen(mdev->net_conf->shared_secret); unsigned int resp_size; struct hash_desc desc; + enum drbd_packets cmd; + unsigned int length; int rv; desc.tfm = mdev->cram_hmac_tfm; @@ -4081,33 +4029,33 @@ static int drbd_do_auth(struct drbd_conf *mdev) if (!rv) goto fail; - rv = drbd_recv_header(mdev, &p); + rv = drbd_recv_header(mdev, &cmd, &length); if (!rv) goto fail; - if (p.command != P_AUTH_CHALLENGE) { + if (cmd != P_AUTH_CHALLENGE) { dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", - cmdname(p.command), p.command); + cmdname(cmd), cmd); rv = 0; goto fail; } - if (p.length > CHALLENGE_LEN*2) { + if (length > CHALLENGE_LEN * 2) { dev_err(DEV, "expected AuthChallenge payload too big.\n"); rv = -1; goto fail; } - peers_ch = kmalloc(p.length, GFP_NOIO); + peers_ch = kmalloc(length, GFP_NOIO); if (peers_ch == NULL) { dev_err(DEV, "kmalloc of peers_ch failed\n"); rv = -1; goto fail; } - rv = drbd_recv(mdev, peers_ch, p.length); + rv = drbd_recv(mdev, peers_ch, length); - if (rv != p.length) { + if (rv != length) { dev_err(DEV, "short read AuthChallenge: l=%u\n", rv); rv = 0; goto fail; @@ -4122,7 +4070,7 @@ static int drbd_do_auth(struct drbd_conf *mdev) } sg_init_table(&sg, 1); - sg_set_buf(&sg, peers_ch, p.length); + sg_set_buf(&sg, peers_ch, length); rv = crypto_hash_digest(&desc, &sg, sg.length, response); if (rv) { @@ -4135,18 +4083,18 @@ static int drbd_do_auth(struct drbd_conf *mdev) if (!rv) goto fail; - rv = drbd_recv_header(mdev, &p); + rv = drbd_recv_header(mdev, &cmd, &length); if (!rv) goto fail; - if (p.command != P_AUTH_RESPONSE) { + if (cmd != P_AUTH_RESPONSE) { dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", - cmdname(p.command), p.command); + cmdname(cmd), cmd); rv = 0; goto fail; } - if (p.length != resp_size) { + if (length != resp_size) { dev_err(DEV, "expected AuthResponse payload of wrong size\n"); rv = 0; goto fail; @@ -4474,9 +4422,8 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h) return TRUE; } -static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header80 *h) +static int got_skip(struct drbd_conf *mdev, struct p_header80 *h) { - /* IGNORE */ return TRUE; } @@ -4504,7 +4451,7 @@ static struct asender_cmd *get_asender_cmd(int cmd) [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, - [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), 
got_something_to_ignore_m }, + [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, [P_MAX_CMD] = { 0, NULL }, }; if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) @@ -4515,7 +4462,7 @@ static struct asender_cmd *get_asender_cmd(int cmd) int drbd_asender(struct drbd_thread *thi) { struct drbd_conf *mdev = thi->mdev; - struct p_header80 *h = &mdev->meta.rbuf.header; + struct p_header80 *h = &mdev->meta.rbuf.header.h80; struct asender_cmd *cmd = NULL; int rv, len; -- cgit v0.10.2 From d53733893dc43f4ebb5be510863c5debf0f8990b Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 23 Aug 2010 15:18:33 +0200 Subject: drbd: Actually allow BIOs up to 128k (was 32k). Now we have multiple BIOs per ee, packets with a 32 bit length field, it gets time to use these goodies. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index db7e655..58dc02b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1389,11 +1389,13 @@ struct bm_extent { #endif /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. - * With a value of 6 all IO in one 32K block make it to the same slot of the + * With a value of 8 all IO in one 128K block make it to the same slot of the * hash table. */ -#define HT_SHIFT 6 +#define HT_SHIFT 8 #define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT)) +#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */ + /* Number of elements in the app_reads_hash */ #define APP_R_HSIZE 15 diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f3f4ea9..71c4c26 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2447,7 +2447,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; - if (req->size <= (1 << 15)) { + if (req->size <= DRBD_MAX_SIZE_H80_PACKET) { p.head.h80.magic = BE_DRBD_MAGIC; p.head.h80.command = cpu_to_be16(P_DATA); p.head.h80.length = @@ -2518,7 +2518,7 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; - if (e->size <= (1 << 15)) { + if (e->size <= DRBD_MAX_SIZE_H80_PACKET) { p.head.h80.magic = BE_DRBD_MAGIC; p.head.h80.command = cpu_to_be16(cmd); p.head.h80.length = diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6b35d41..97fb2c2 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1063,7 +1063,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp mdev->read_cnt = 0; mdev->writ_cnt = 0; - drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); + drbd_setup_queue_param(mdev, mdev->state.conn == C_CONNECTED && + mdev->agreed_pro_version < 95 ? 
+ DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE); /* If I am currently not R_PRIMARY, * but meta data primary indicator is set, diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index fe30864..4249117 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -925,6 +925,11 @@ retry: drbd_thread_start(&mdev->asender); + if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) { + drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET); + put_ldev(mdev); + } + if (!drbd_send_protocol(mdev)) return -1; drbd_send_sync_param(mdev, &mdev->sync_conf); -- cgit v0.10.2 From 0778286a133d2d3f81861a4e5db308e359583006 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 31 Aug 2010 12:00:50 +0200 Subject: drbd: Disable activity log updates when the whole device is out of sync When the complete device is marked as out of sync, we can disable updates of the on disk AL. Currently AL updates are only disabled if one uses the "invalidate-remote" command on an unconnected, primary device, or when at attach time all bits in the bitmap are set. As of now, AL updated do not get disabled when a all bits becomes set due to application writes to an unconnected DRBD device. While this is a missing feature, it is not considered important, and might get added later. BTW, after initializing a "one legged" DRBD device drbdadm create-md resX drbdadm -- --force primary resX AL updates also get disabled, until the first connect. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index e3f88d6..fd42832 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -569,7 +569,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) * * maybe bm_set should be atomic_t ? */ -static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) +unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; unsigned long s; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 58dc02b..bb3a488 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -863,6 +863,7 @@ enum { CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ NEW_CUR_UUID, /* Create new current UUID when thawing IO */ + AL_SUSPENDED, /* Activity logging is currently suspended. 
*/ }; struct drbd_bitmap; /* opaque for drbd_conf */ @@ -1425,6 +1426,7 @@ extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_ /* bm_find_next variants for use while you hold drbd_bm_lock() */ extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo); +extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev); extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev); extern int drbd_bm_rs_done(struct drbd_conf *mdev); /* for receive_bitmap */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 71c4c26..23878ff 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -963,6 +963,12 @@ static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) } } +static void drbd_resume_al(struct drbd_conf *mdev) +{ + if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags)) + dev_info(DEV, "Resumed AL updates\n"); +} + /** * __drbd_set_state() - Set a new DRBD state * @mdev: DRBD device. @@ -1160,6 +1166,10 @@ int __drbd_set_state(struct drbd_conf *mdev, ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) drbd_thread_restart_nowait(&mdev->receiver); + /* Resume AL writing if we get a connection */ + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) + drbd_resume_al(mdev); + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); if (ascw) { ascw->os = os; @@ -2851,6 +2861,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) } drbd_free_resources(mdev); + clear_bit(AL_SUSPENDED, &mdev->flags); /* * currently we drbd_init_ee only on module load, so @@ -3652,6 +3663,7 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev) { int rv = -EIO; + drbd_resume_al(mdev); if (get_ldev_if_state(mdev, D_ATTACHING)) { drbd_bm_clear_all(mdev); rv = drbd_bm_write(mdev); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 97fb2c2..6742652 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -777,6 +777,29 @@ static void drbd_reconfig_done(struct drbd_conf *mdev) wake_up(&mdev->state_wait); } +/* Make sure IO is suspended before calling this function(). */ +static void drbd_suspend_al(struct drbd_conf *mdev) +{ + int s = 0; + + if (lc_try_lock(mdev->act_log)) { + drbd_al_shrink(mdev); + lc_unlock(mdev->act_log); + } else { + dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n"); + return; + } + + spin_lock_irq(&mdev->req_lock); + if (mdev->state.conn < C_CONNECTED) + s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags); + + spin_unlock_irq(&mdev->req_lock); + + if (s) + dev_info(DEV, "Suspended AL updates\n"); +} + /* does always return 0; * interesting return code is in reply->ret_code */ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, @@ -1113,6 +1136,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp drbd_al_to_on_disk_bm(mdev); } + if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev)) + drbd_suspend_al(mdev); /* IO is still suspended here... 
*/ + spin_lock_irq(&mdev->req_lock); os = mdev->state; ns.i = os.i; @@ -1792,12 +1818,38 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl return 0; } +static int drbd_bmio_set_susp_al(struct drbd_conf *mdev) +{ + int rv; + + rv = drbd_bmio_set_n_write(mdev); + drbd_suspend_al(mdev); + return rv; +} + static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) { + int retcode; - reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); + retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); + + if (retcode < SS_SUCCESS) { + if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { + /* The peer will get a resync upon connect anyways. Just make that + into a full resync. */ + retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + /* open coded drbd_bitmap_io() */ + if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, + "set_n_write from invalidate_peer")) + retcode = ERR_IO_MD_DISK; + } + } else + retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); + } + reply->ret_code = retcode; return 0; } diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index c159692..a4a4a06 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -203,7 +203,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%2d: cs:Unconfigured\n", i); } else { seq_printf(seq, - "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n" + "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", i, sn, @@ -218,6 +218,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v) mdev->state.peer_isp ? 'p' : '-', mdev->state.user_isp ? 'u' : '-', mdev->congestion_reason ?: '-', + test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-', mdev->send_cnt/2, mdev->recv_cnt/2, mdev->writ_cnt/2, diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3b61d76..af608b3 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -94,7 +94,8 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const */ if (s & RQ_LOCAL_MASK) { if (get_ldev_if_state(mdev, D_FAILED)) { - drbd_al_complete_io(mdev, req->sector); + if (s & RQ_IN_ACT_LOG) + drbd_al_complete_io(mdev, req->sector); put_ldev(mdev); } else if (__ratelimit(&drbd_ratelimit_state)) { dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " @@ -802,8 +803,10 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) * resync extent to finish, and, if necessary, pulls in the target * extent into the activity log, which involves further disk io because * of transactional on-disk meta data updates. */ - if (rw == WRITE && local) + if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) { + req->rq_state |= RQ_IN_ACT_LOG; drbd_al_begin_io(mdev, sector); + } remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || (mdev->state.pdsk == D_INCONSISTENT && diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index f2e45aa..181ea03 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -189,6 +189,9 @@ enum drbd_req_state_bits { /* Set when this is a write, clear for a read */ __RQ_WRITE, + + /* Should call drbd_al_complete_io() for this request... 
*/ + __RQ_IN_ACT_LOG, }; #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) @@ -208,6 +211,7 @@ enum drbd_req_state_bits { #define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) #define RQ_WRITE (1UL << __RQ_WRITE) +#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) /* For waking up the frozen transfer log mod_req() has to return if the request should be counted in the epoch object*/ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 3d0e14e..8be9832 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1273,7 +1273,7 @@ int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_request *req = container_of(w, struct drbd_request, w); - if (bio_data_dir(req->master_bio) == WRITE) + if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) drbd_al_begin_io(mdev, req->sector); /* Calling drbd_al_begin_io() out of the worker might deadlocks theoretically. Practically it can not deadlock, since this is -- cgit v0.10.2 From c518d04fdec3d8b9d6f8b2228040934de9ee6708 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 1 Sep 2010 09:50:23 +0200 Subject: drbd: fix race between deconfiguring and reconfiguring network If a drbd_nl_net_conf hits the small window between the state change to C_STANDALONE and the corresponding cleanup in after_state_ch, that cleanup would throw away stuff we now need again, and later trigger BUG_ON()s. Fixed by properly serializing the new config request with any pending cleanup. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6742652..d066190 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -750,14 +750,16 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu /* serialize deconfig (worker exiting, doing cleanup) * and reconfig (drbdsetup disk, drbdsetup net) * - * wait for a potentially exiting worker, then restart it, - * or start a new one. + * Wait for a potentially exiting worker, then restart it, + * or start a new one. Flush any pending work, there may still be an + * after_state_change queued. */ static void drbd_reconfig_start(struct drbd_conf *mdev) { wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)); wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); drbd_thread_start(&mdev->worker); + drbd_flush_workqueue(mdev); } /* if still unconfigured, stops worker again. -- cgit v0.10.2 From 3f3a9b849d2b703934c07fa17f5eac2dc37c1f6b Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 1 Sep 2010 15:12:12 +0200 Subject: drbd: fix race on meta-data update The race: drbd_md_mark_dirty() drbd_md_sync() if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) return; drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE) ==> RACE clear_bit(MD_DIRTY, &mdev->flags); <== spurious Fixed by removing the spurious clear_bit. 
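The window is easiest to see with the timer and the actual page I/O stripped away. Below is a minimal userspace sketch, assuming C11 atomics in place of the kernel's set_bit()/test_and_clear_bit(); the names md_dirty, mark_dirty, write_out_meta_data and md_sync are invented analogues, not DRBD code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool md_dirty;

static void mark_dirty(void)                    /* drbd_md_mark_dirty() analogue */
{
        atomic_store(&md_dirty, true);
}

static void write_out_meta_data(void)           /* stands in for the slow page I/O */
{
        /* mark_dirty() may run concurrently while we are in here */
}

static void md_sync(void)                       /* drbd_md_sync() analogue */
{
        /* exactly one atomic test-and-clear decides whether work is pending */
        if (!atomic_exchange(&md_dirty, false))
                return;

        write_out_meta_data();

        /* The spurious clear that the patch removes would go here. Clearing
         * again after the write can wipe out a mark_dirty() that raced with
         * it, so that change would never be written back:
         *     atomic_store(&md_dirty, false);   // the bug
         */
}

int main(void)
{
        mark_dirty();
        md_sync();
        printf("dirty after sync: %d\n", (int)atomic_load(&md_dirty));
        return 0;
}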
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 23878ff..73c905d 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3446,12 +3446,9 @@ void drbd_md_sync(struct drbd_conf *mdev) D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); sector = mdev->ldev->md.md_offset; - if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { - clear_bit(MD_DIRTY, &mdev->flags); - } else { + if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { /* this was a try anyways ... */ dev_err(DEV, "meta data update failed!\n"); - drbd_chk_io_error(mdev, 1, TRUE); } -- cgit v0.10.2 From ef50a3e34f93a067ada541346be3175e924331a2 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 1 Sep 2010 14:39:30 +0200 Subject: drbd: implicitly create unconfigured devices on sync-after dependencies If pacemaker (for example) decided to initialize minor devices not in the exact sync-after dependency order, the configuration partially failed with an error "The sync-after minor number is invalid". (Bugz. #322) We can avoid that by implicitly creating unconfigured minor devices, if others depend on them. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index d066190..e0061a9 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -413,6 +413,39 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) return r; } +static struct drbd_conf *ensure_mdev(int minor, int create) +{ + struct drbd_conf *mdev; + + if (minor >= minor_count) + return NULL; + + mdev = minor_to_mdev(minor); + + if (!mdev && create) { + struct gendisk *disk = NULL; + mdev = drbd_new_device(minor); + + spin_lock_irq(&drbd_pp_lock); + if (minor_table[minor] == NULL) { + minor_table[minor] = mdev; + disk = mdev->vdisk; + mdev = NULL; + } /* else: we lost the race */ + spin_unlock_irq(&drbd_pp_lock); + + if (disk) /* we won the race above */ + /* in case we ever add a drbd_delete_device(), + * don't forget the del_gendisk! */ + add_disk(disk); + else /* we lost the race above */ + drbd_free_mdev(mdev); + + mdev = minor_to_mdev(minor); + } + + return mdev; +} static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) @@ -1713,6 +1746,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n } #undef AL_MAX + /* to avoid spurious errors when configuring minors before configuring + * the minors they depend on: if necessary, first create the minor we + * depend on */ + if (sc.after >= 0) + ensure_mdev(sc.after, 1); + /* most sanity checks done, try to assign the new sync-after * dependency. need to hold the global lock in there, * to avoid a race in the dependency loop check. 
*/ @@ -2080,40 +2119,6 @@ out: return 0; } -static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp) -{ - struct drbd_conf *mdev; - - if (nlp->drbd_minor >= minor_count) - return NULL; - - mdev = minor_to_mdev(nlp->drbd_minor); - - if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) { - struct gendisk *disk = NULL; - mdev = drbd_new_device(nlp->drbd_minor); - - spin_lock_irq(&drbd_pp_lock); - if (minor_table[nlp->drbd_minor] == NULL) { - minor_table[nlp->drbd_minor] = mdev; - disk = mdev->vdisk; - mdev = NULL; - } /* else: we lost the race */ - spin_unlock_irq(&drbd_pp_lock); - - if (disk) /* we won the race above */ - /* in case we ever add a drbd_delete_device(), - * don't forget the del_gendisk! */ - add_disk(disk); - else /* we lost the race above */ - drbd_free_mdev(mdev); - - mdev = minor_to_mdev(nlp->drbd_minor); - } - - return mdev; -} - struct cn_handler_struct { int (*function)(struct drbd_conf *, struct drbd_nl_cfg_req *, @@ -2174,7 +2179,8 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms goto fail; } - mdev = ensure_mdev(nlp); + mdev = ensure_mdev(nlp->drbd_minor, + (nlp->flags & DRBD_NL_CREATE_DEVICE)); if (!mdev) { retcode = ERR_MINOR_INVALID; goto fail; -- cgit v0.10.2 From 63106d3c6c769b6219bd04edde513b12abae3f61 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 1 Sep 2010 15:47:15 +0200 Subject: drbd: Removed a race that could cause unexpected execution of w_make_resync_request() The actual race happened int the drbd_start_resync() function. Where drbd_resync_finished() -> __drbd_set_state() set STOP_SYNC_TIMER and armed the timer. If the timer fired before execution reaches the mod_timer statement at the end of drbd_start_resync() the latter would cause an unexpected call to w_make_resync_request(). Removed the STOP_SYNC_TIMER bit, and base it on the connection state. The STOP_SYNC_TIMER bit probably originates probably the time before the state engine. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index bb3a488..d5e38de8 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -827,7 +827,6 @@ enum { SIGNAL_ASENDER, /* whether asender wants to be interrupted */ SEND_PING, /* whether asender should send a ping asap */ - STOP_SYNC_TIMER, /* tell timer to cancel itself */ UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ MD_DIRTY, /* current uuids and flags not yet on disk */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 73c905d..5dd071e 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1052,12 +1052,6 @@ int __drbd_set_state(struct drbd_conf *mdev, wake_up(&mdev->misc_wait); wake_up(&mdev->state_wait); - /* post-state-change actions */ - if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) { - set_bit(STOP_SYNC_TIMER, &mdev->flags); - mod_timer(&mdev->resync_timer, jiffies); - } - /* aborted verify run. 
log the last position */ if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && ns.conn < C_CONNECTED) { @@ -1072,22 +1066,14 @@ int __drbd_set_state(struct drbd_conf *mdev, dev_info(DEV, "Syncer continues.\n"); mdev->rs_paused += (long)jiffies -(long)mdev->rs_mark_time[mdev->rs_last_mark]; - if (ns.conn == C_SYNC_TARGET) { - if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags)) - mod_timer(&mdev->resync_timer, jiffies); - /* This if (!test_bit) is only needed for the case - that a device that has ceased to used its timer, - i.e. it is already in drbd_resync_finished() gets - paused and resumed. */ - } + if (ns.conn == C_SYNC_TARGET) + mod_timer(&mdev->resync_timer, jiffies); } if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { dev_info(DEV, "Resync suspended\n"); mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; - if (ns.conn == C_PAUSED_SYNC_T) - set_bit(STOP_SYNC_TIMER, &mdev->flags); } if (os.conn == C_CONNECTED && diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 4249117..885471d 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3803,7 +3803,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) /* make sure syncer is stopped and w_resume_next_sg queued */ del_timer_sync(&mdev->resync_timer); - set_bit(STOP_SYNC_TIMER, &mdev->flags); resync_timer_fn((unsigned long)mdev); /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 8be9832..0e5bf8c 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -395,25 +395,22 @@ defer: void resync_timer_fn(unsigned long data) { - unsigned long flags; struct drbd_conf *mdev = (struct drbd_conf *) data; int queue; - spin_lock_irqsave(&mdev->req_lock, flags); - - if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) { - queue = 1; - if (mdev->state.conn == C_VERIFY_S) - mdev->resync_work.cb = w_make_ov_request; - else - mdev->resync_work.cb = w_make_resync_request; - } else { + queue = 1; + switch (mdev->state.conn) { + case C_VERIFY_S: + mdev->resync_work.cb = w_make_ov_request; + break; + case C_SYNC_TARGET: + mdev->resync_work.cb = w_make_resync_request; + break; + default: queue = 0; mdev->resync_work.cb = w_resync_inactive; } - spin_unlock_irqrestore(&mdev->req_lock, flags); - /* harmless race: list_empty outside data.work.q_lock */ if (list_empty(&mdev->resync_work.list) && queue) drbd_queue_work(&mdev->data.work, &mdev->resync_work); -- cgit v0.10.2 From ee15b038164fcf19b798021762dee3cf5cbc6433 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 3 Sep 2010 10:00:09 +0200 Subject: drbd: fix race on meta-data update, addendum addendum to baa33ae4eaa4477b60af7c434c0ddd1d182c1ae7 The race: drbd_md_sync() if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) return; ==> RACE with drbd_md_mark_dirty() rearming the timer. del_timer(&mdev->md_sync_timer); Fixed by moving the del_timer before the test_and_clear_bit. Additionally only rearm the timer in drbd_md_mark_dirty, if MD_DIRTY was not already set, reduce the grace period from five to one second, and add an ifdef'ed debuging aid to find code paths missing an explicit drbd_md_sync, if any, as those are the only relevant ones for this race. 
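The ifdef'ed debugging aid boils down to a wrapper macro that records where the dirty flag was last set, so a code path that forgot its explicit sync can be traced. A self-contained sketch of that pattern follows; struct dev, mark_dirty and report_stale_dirty are invented for illustration, only the __LINE__/__func__ recording and the rearm-only-on-first-set rule come from the patch.

#include <stdio.h>

struct dev {
        int dirty;
        struct {
                unsigned int line;
                const char *func;
        } last_mark_dirty;
};

#define mark_dirty(d) mark_dirty_((d), __LINE__, __func__)

static void mark_dirty_(struct dev *d, unsigned int line, const char *func)
{
        if (!d->dirty) {                /* only the 0 -> 1 transition records the
                                         * caller (and, in the kernel, rearms the
                                         * one-second sync timer) */
                d->dirty = 1;
                d->last_mark_dirty.line = line;
                d->last_mark_dirty.func = func;
        }
}

static void report_stale_dirty(const struct dev *d)
{
        if (d->dirty)
                fprintf(stderr, "still dirty, last marked at %s:%u\n",
                        d->last_mark_dirty.func, d->last_mark_dirty.line);
}

int main(void)
{
        struct dev d = { 0 };

        mark_dirty(&d);
        report_stale_dirty(&d);
        return 0;
}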
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index d5e38de8..f9b75fc 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -975,6 +975,12 @@ struct drbd_conf { md_sync_work; struct timer_list resync_timer; struct timer_list md_sync_timer; +#ifdef DRBD_DEBUG_MD_SYNC + struct { + unsigned int line; + const char* func; + } last_md_mark_dirty; +#endif /* Used after attach while negotiating new disk state. */ union drbd_state new_state_tmp; @@ -1253,7 +1259,13 @@ extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); extern int drbd_md_test_flag(struct drbd_backing_dev *, int); +#ifndef DRBD_DEBUG_MD_SYNC extern void drbd_md_mark_dirty(struct drbd_conf *mdev); +#else +#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ ) +extern void drbd_md_mark_dirty_(struct drbd_conf *mdev, + unsigned int line, const char *func); +#endif extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), void (*done)(struct drbd_conf *, int), diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 5dd071e..ab1244e 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3402,9 +3402,10 @@ void drbd_md_sync(struct drbd_conf *mdev) sector_t sector; int i; + del_timer(&mdev->md_sync_timer); + /* timer may be rearmed by drbd_md_mark_dirty() now. */ if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) return; - del_timer(&mdev->md_sync_timer); /* We use here D_FAILED and not D_ATTACHING because we try to write * metadata even if we detach due to a disk failure! */ @@ -3529,12 +3530,22 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) * the meta-data super block. This function sets MD_DIRTY, and starts a * timer that ensures that within five seconds you have to call drbd_md_sync(). */ +#ifdef DRBD_DEBUG_MD_SYNC +void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func) +{ + if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) { + mod_timer(&mdev->md_sync_timer, jiffies + HZ); + mdev->last_md_mark_dirty.line = line; + mdev->last_md_mark_dirty.func = func; + } +} +#else void drbd_md_mark_dirty(struct drbd_conf *mdev) { - set_bit(MD_DIRTY, &mdev->flags); - mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); + if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) + mod_timer(&mdev->md_sync_timer, jiffies + HZ); } - +#endif static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) { @@ -3775,8 +3786,11 @@ static void md_sync_timer_fn(unsigned long data) static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) { dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); +#ifdef DEBUG + dev_warn(DEV, "last md_mark_dirty: %s:%u\n", + mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line); +#endif drbd_md_sync(mdev); - return 1; } -- cgit v0.10.2 From 1090c056c5eb6d5335cceb381683e77ac24c71ab Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 19 Jul 2010 17:41:04 +0200 Subject: drbd: drbd_md_sync before calling user space helpers Just in case we have some pending meta data changes to sync, do it before we call our userland helper, as that may take some time, or even cause a hard reboot. 
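The ordering rule generalises to any code that hands control to an external helper: persist what is still owed to disk first, because the helper may never return. A hedged userspace sketch with invented names (sync_meta_data, run_helper) and a placeholder helper path:

#include <stdlib.h>

static void sync_meta_data(void)
{
        /* write out and fsync() anything still dirty; details elided */
}

static int run_helper(const char *cmd)
{
        sync_meta_data();       /* first: the helper may block for a long time
                                 * or even reboot the machine before returning */
        return system(cmd);     /* e.g. a fence-peer or notification script */
}

int main(void)
{
        return run_helper("/bin/true");         /* placeholder command */
}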
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e0061a9..5b30f90 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -172,6 +172,10 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) put_net_conf(mdev); } + /* The helper may take some time. + * write out any unsynced meta data changes now */ + drbd_md_sync(mdev); + dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); drbd_bcast_ev_helper(mdev, cmd); -- cgit v0.10.2 From 76d2e7eca8e7675c6d7a6592f9e747b121cc8a87 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 25 Aug 2010 11:58:05 +0200 Subject: drbd: Adding support for BIO/Request flags: REQ_FUA, REQ_FLUSH and REQ_DISCARD Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f9b75fc..79b877d 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -374,9 +374,13 @@ union p_header { */ /* these defines must not be changed without changing the protocol version */ -#define DP_HARDBARRIER 1 -#define DP_RW_SYNC 2 +#define DP_HARDBARRIER 1 /* depricated */ +#define DP_RW_SYNC 2 /* equals REQ_SYNC */ #define DP_MAY_SET_IN_SYNC 4 +#define DP_UNPLUG 8 /* equals REQ_UNPLUG */ +#define DP_FUA 16 /* equals REQ_FUA */ +#define DP_FLUSH 32 /* equals REQ_FLUSH */ +#define DP_DISCARD 64 /* equals REQ_DISCARD */ struct p_data { union p_header head; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index ab1244e..1827cf0 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2426,6 +2426,18 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) return 1; } +static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) +{ + if (mdev->agreed_pro_version >= 95) + return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | + (bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) | + (bi_rw & REQ_FUA ? DP_FUA : 0) | + (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | + (bi_rw & REQ_DISCARD ? DP_DISCARD : 0); + else + return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? 
DP_RW_SYNC : 0; +} + /* Used to send write requests * R_PRIMARY -> Peer (P_DATA) */ @@ -2459,21 +2471,9 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) p.block_id = (unsigned long)req; p.seq_num = cpu_to_be32(req->seq_num = atomic_add_return(1, &mdev->packet_seq)); - dp_flags = 0; - /* NOTE: no need to check if barriers supported here as we would - * not pass the test in make_request_common in that case - */ - if (req->master_bio->bi_rw & REQ_HARDBARRIER) { - dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n"); - /* dp_flags |= DP_HARDBARRIER; */ - } - if (req->master_bio->bi_rw & REQ_SYNC) - dp_flags |= DP_RW_SYNC; - /* for now handle SYNCIO and UNPLUG - * as if they still were one and the same flag */ - if (req->master_bio->bi_rw & REQ_UNPLUG) - dp_flags |= DP_RW_SYNC; + dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); + if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn <= C_PAUSED_SYNC_T) dp_flags |= DP_MAY_SET_IN_SYNC; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 885471d..e96fbb0 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1753,6 +1753,18 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) return ret; } +static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf) +{ + if (mdev->agreed_pro_version >= 95) + return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | + (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) | + (dpf & DP_FUA ? REQ_FUA : 0) | + (dpf & DP_FLUSH ? REQ_FUA : 0) | + (dpf & DP_DISCARD ? REQ_DISCARD : 0); + else + return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0; +} + /* mirrored write */ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { @@ -1818,12 +1830,8 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned spin_unlock(&mdev->epoch_lock); dp_flags = be32_to_cpu(p->dp_flags); - if (dp_flags & DP_HARDBARRIER) { - dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n"); - /* rw |= REQ_HARDBARRIER; */ - } - if (dp_flags & DP_RW_SYNC) - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= write_flags_to_bio(mdev, dp_flags); + if (dp_flags & DP_MAY_SET_IN_SYNC) e->flags |= EE_MAY_SET_IN_SYNC; -- cgit v0.10.2 From 435f07402b3165b90592073bc0f8c6f8fa160ff9 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 6 Sep 2010 12:30:25 +0200 Subject: drbd: don't count sendpage()d pages only referenced by tcp as in use Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 79b877d..eb1273d 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1077,7 +1077,8 @@ struct drbd_conf { int next_barrier_nr; struct hlist_head *app_reads_hash; /* is proteced by req_lock */ struct list_head resync_reads; - atomic_t pp_in_use; + atomic_t pp_in_use; /* allocated from page pool */ + atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ wait_queue_head_t ee_wait; struct page *md_io_page; /* one page buffer for md_io */ struct page *md_io_tmpp; /* for logical_block_size != 512 */ @@ -1555,7 +1556,10 @@ extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, sector_t sector, unsigned int data_size, gfp_t gfp_mask) __must_hold(local); -extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e); +extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, + int is_net); +#define 
drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0) +#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1) extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head); extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 1827cf0..981cfd178 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2753,6 +2753,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->net_cnt, 0); atomic_set(&mdev->packet_seq, 0); atomic_set(&mdev->pp_in_use, 0); + atomic_set(&mdev->pp_in_use_by_net, 0); atomic_set(&mdev->rs_sect_in, 0); atomic_set(&mdev->rs_sect_ev, 0); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index e96fbb0..2c3edf0 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -241,7 +241,7 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) spin_unlock_irq(&mdev->req_lock); list_for_each_entry_safe(e, t, &reclaimed, w.list) - drbd_free_ee(mdev, e); + drbd_free_net_ee(mdev, e); } /** @@ -298,9 +298,11 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool * Is also used from inside an other spin_lock_irq(&mdev->req_lock); * Either links the page chain back to the global pool, * or returns all pages to the system. */ -static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) +static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net) { + atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use; int i; + if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) i = page_chain_free(page); else { @@ -311,10 +313,10 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) drbd_pp_vacant += i; spin_unlock(&drbd_pp_lock); } - atomic_sub(i, &mdev->pp_in_use); - i = atomic_read(&mdev->pp_in_use); + i = atomic_sub_return(i, a); if (i < 0) - dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i); + dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n", + is_net ? "pp_in_use_by_net" : "pp_in_use", i); wake_up(&drbd_pp_wait); } @@ -374,11 +376,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, return NULL; } -void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) +void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net) { if (e->flags & EE_HAS_DIGEST) kfree(e->digest); - drbd_pp_free(mdev, e->pages); + drbd_pp_free(mdev, e->pages, is_net); D_ASSERT(atomic_read(&e->pending_bios) == 0); D_ASSERT(hlist_unhashed(&e->colision)); mempool_free(e, drbd_ee_mempool); @@ -389,13 +391,14 @@ int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) LIST_HEAD(work_list); struct drbd_epoch_entry *e, *t; int count = 0; + int is_net = list == &mdev->net_ee; spin_lock_irq(&mdev->req_lock); list_splice_init(list, &work_list); spin_unlock_irq(&mdev->req_lock); list_for_each_entry_safe(e, t, &work_list, w.list) { - drbd_free_ee(mdev, e); + drbd_free_some_ee(mdev, e, is_net); count++; } return count; @@ -424,7 +427,7 @@ static int drbd_process_done_ee(struct drbd_conf *mdev) spin_unlock_irq(&mdev->req_lock); list_for_each_entry_safe(e, t, &reclaimed, w.list) - drbd_free_ee(mdev, e); + drbd_free_net_ee(mdev, e); /* possible callbacks here: * e_end_block, and e_end_resync_block, e_send_discard_ack. 
@@ -1460,7 +1463,7 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size) data_size -= rr; } kunmap(page); - drbd_pp_free(mdev, page); + drbd_pp_free(mdev, page, 0); return rv; } @@ -3879,6 +3882,9 @@ static void drbd_disconnect(struct drbd_conf *mdev) i = drbd_release_ee(mdev, &mdev->net_ee); if (i) dev_info(DEV, "net_ee not empty, killed %u entries\n", i); + i = atomic_read(&mdev->pp_in_use_by_net); + if (i) + dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i); i = atomic_read(&mdev->pp_in_use); if (i) dev_info(DEV, "pp_in_use = %d, expected 0\n", i); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0e5bf8c..0174319 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -914,9 +914,13 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent { if (drbd_ee_has_active_page(e)) { /* This might happen if sendpage() has not finished */ + int i = DIV_ROUND_UP(e->size, PAGE_SIZE); + atomic_add(i, &mdev->pp_in_use_by_net); + atomic_sub(i, &mdev->pp_in_use); spin_lock_irq(&mdev->req_lock); list_add_tail(&e->w.list, &mdev->net_ee); spin_unlock_irq(&mdev->req_lock); + wake_up(&drbd_pp_wait); } else drbd_free_ee(mdev, e); } -- cgit v0.10.2 From 1d53f09e170e477de67babd7a10e277479260d51 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Sun, 5 Sep 2010 01:13:24 +0200 Subject: drbd: fix potential kernel BUG (NULL deref) BUG trace would look like: lc_find drbd_rs_complete_io got_OVResult drbd_asender Could be triggered by explicit, or IO-error policy based, detach during online-verify. We may only dereference mdev->resync, if we first get_ldev(), as the disk may break any time, causing mdev->resync to disappear once all ldev references have been returned. Already in flight online-verify requests or replies may still come in, which we then need to ignore. 
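The pattern behind the fix is a conditional reference: late verify replies may arrive after the backing device is gone, so the resync data may only be touched between a successful try-get and the matching put. A userspace sketch of that guard, assuming C11 atomics; the struct, field names and refcount scheme below are invented, only the "try-get or silently ignore" rule mirrors get_ldev()/put_ldev().

#include <stdatomic.h>
#include <stdbool.h>

struct device {
        atomic_int ldev_refs;   /* > 0 only while the backing store is attached */
        void *resync_lru;       /* stands in for mdev->resync, freed on detach */
};

static bool try_get_ldev(struct device *d)
{
        int r = atomic_load(&d->ldev_refs);

        while (r > 0) {
                if (atomic_compare_exchange_weak(&d->ldev_refs, &r, r + 1))
                        return true;    /* holds a reference, resync_lru is valid */
        }
        return false;                   /* device already detached */
}

static void put_ldev(struct device *d)
{
        atomic_fetch_sub(&d->ldev_refs, 1);     /* the final put frees resync_lru */
}

static void handle_verify_reply(struct device *d)
{
        if (!try_get_ldev(d))
                return;                 /* late reply after detach: just ignore it */
        /* ... safe to use d->resync_lru here ... */
        put_ldev(d);
}

int main(void)
{
        struct device d = { .resync_lru = 0 };

        atomic_store(&d.ldev_refs, 1);  /* "attached" */
        handle_verify_reply(&d);
        return 0;
}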
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 2c3edf0..e4e4edd 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -4241,10 +4241,13 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h) update_peer_seq(mdev, be32_to_cpu(p->seq_num)); - drbd_rs_complete_io(mdev, sector); - drbd_set_in_sync(mdev, sector, blksize); - /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ - mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); + if (get_ldev(mdev)) { + drbd_rs_complete_io(mdev, sector); + drbd_set_in_sync(mdev, sector, blksize); + /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ + mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); + put_ldev(mdev); + } dec_rs_pending(mdev); atomic_add(blksize >> 9, &mdev->rs_sect_in); @@ -4423,6 +4426,9 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h) else ov_oos_print(mdev); + if (!get_ldev(mdev)) + return TRUE; + drbd_rs_complete_io(mdev, sector); dec_rs_pending(mdev); @@ -4437,6 +4443,7 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h) drbd_resync_finished(mdev); } } + put_ldev(mdev); return TRUE; } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0174319..c72a5fc 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1027,7 +1027,10 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) return 1; } - drbd_rs_complete_io(mdev, e->sector); + if (get_ldev(mdev)) { + drbd_rs_complete_io(mdev, e->sector); + put_ldev(mdev); + } di = e->digest; @@ -1134,7 +1137,10 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all * the resync lru has been cleaned up already */ - drbd_rs_complete_io(mdev, e->sector); + if (get_ldev(mdev)) { + drbd_rs_complete_io(mdev, e->sector); + put_ldev(mdev); + } di = e->digest; -- cgit v0.10.2 From a821cc4a9a8d9e67356f9a5cfc1206aa3cfd30f7 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 6 Sep 2010 12:31:37 +0200 Subject: drbd: fix spurious protocol error If we cannot satisfy a request (because our disk just broke), we still need to drain the payload. Or we'll get a protocol error when interpreting the payload as DRBD packet header. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index e4e4edd..983e49c 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2085,7 +2085,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un "no local data.\n"); drbd_send_ack_rp(mdev, cmd == P_DATA_REQUEST ? 
P_NEG_DREPLY : P_NEG_RS_DREPLY , p); - return TRUE; + /* drain possibly payload */ + return drbd_drain_block(mdev, digest_size); } /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD -- cgit v0.10.2 From 02bc7174ae83617b4364dc179d95d848d9fd6db5 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 6 Sep 2010 12:13:20 +0200 Subject: drbd: cosmetic, don't report resync for online-verify Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 981cfd178..2060db6 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -504,7 +504,7 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns); static int is_valid_state_transition(struct drbd_conf *, union drbd_state, union drbd_state); static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, int *warn_sync_abort); + union drbd_state ns, const char **warn_sync_abort); int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); @@ -812,7 +812,7 @@ static int is_valid_state_transition(struct drbd_conf *mdev, * to D_UNKNOWN. This rule and many more along those lines are in this function. */ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, int *warn_sync_abort) + union drbd_state ns, const char **warn_sync_abort) { enum drbd_fencing_p fp; @@ -851,7 +851,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { if (warn_sync_abort) - *warn_sync_abort = 1; + *warn_sync_abort = + os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? + "Online-verify" : "Resync"; ns.conn = C_CONNECTED; } @@ -984,7 +986,7 @@ int __drbd_set_state(struct drbd_conf *mdev, { union drbd_state os; int rv = SS_SUCCESS; - int warn_sync_abort = 0; + const char *warn_sync_abort = NULL; struct after_state_chg_work *ascw; os = mdev->state; @@ -1016,7 +1018,7 @@ int __drbd_set_state(struct drbd_conf *mdev, } if (warn_sync_abort) - dev_warn(DEV, "Resync aborted.\n"); + dev_warn(DEV, "%s aborted.\n", warn_sync_abort); { char *pbp, pb[300]; -- cgit v0.10.2 From f2906e183f5460df9d9e774f5952f5ff670b3913 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 21 Jul 2010 17:04:32 +0200 Subject: drbd: fix for spurious full sync (becoming sync target looked like invalidate) If a synctarget lost connection while being WFSyncUUID, due to "state sanitizing", the attempted state change to SyncTarget looked like an "invalidate" to after_state_ch() later, thus caused a full sync on next handshake (Bug #318). drbd0: PingAck did not arrive in time. drbd0: peer( Primary -> Unknown ) conn( WFSyncUUID -> NetworkFailure ) pdsk( UpToDate -> DUnknown ) from : { cs:NetworkFailure ro:Secondary/Unknown ds:UpToDate/DUnknown r--- } to : { cs:SyncTarget ro:Secondary/Unknown ds:Inconsistent/DUnknown r--- } after sanizising, resulted in state: { cs:NetworkFailure ro:Secondary/Unknown ds:Inconsistent/DUnknown r--- } drbd0: disk( UpToDate -> Inconsistent ) Fix: don't mask state transition errors in "sanitizing", so the requested state change to SyncTarget fails, instead of being implicitly "remaped" to invalidate. 
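In other words, the sanitizing step may only rewrite a request into another member of the same "network breakdown" family; anything else has to pass through untouched so that the normal validity check can reject it. A compressed sketch of that rule, using an invented and much smaller connection enum (the real ordering and the C_DISCONNECTING case are omitted):

enum conn {     /* invented ordering, breakdown states at or below C_TEAR_DOWN */
        C_UNCONNECTED, C_TIMEOUT, C_NET_FAILURE, C_TEAR_DOWN,
        C_WF_CONNECTION, C_CONNECTED, C_SYNC_TARGET,
};

/* Old behaviour: any request after a network error was rewritten to the old
 * state, which hid the failure from the caller. New behaviour: only requests
 * within the breakdown family are rewritten; a request for e.g. C_SYNC_TARGET
 * passes through and is then rejected as an invalid transition, so the caller
 * sees an error instead of a silent, invalidate-like result. */
static enum conn sanitize_conn(enum conn os, enum conn ns)
{
        if (os >= C_TIMEOUT && os <= C_TEAR_DOWN &&
            ns != C_UNCONNECTED && ns <= C_TEAR_DOWN)
                ns = os;
        return ns;
}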
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 2060db6..04c305d 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -827,9 +827,10 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os.conn <= C_DISCONNECTING) ns.conn = os.conn; - /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */ + /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. + * If you try to go into some Sync* state, that shall fail (elsewhere). */ if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && - ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING) + ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) ns.conn = os.conn; /* After C_DISCONNECTING only C_STANDALONE may follow */ -- cgit v0.10.2 From 5a75cc7cfbb98e896232902214432dae30653dfe Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 9 Sep 2010 14:22:21 +0200 Subject: drbd: Fixed compatibility with protocol versions smaller than 95 Forgot to consider the max size for the resync requests. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index c72a5fc..daa672f 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -534,8 +534,9 @@ int w_make_resync_request(struct drbd_conf *mdev, /* starting with drbd 8.3.8, we can handle multi-bio EEs, * if it should be necessary */ - max_segment_size = mdev->agreed_pro_version < 94 ? - queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE; + max_segment_size = + mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) : + mdev->agreed_pro_version < 95 ? DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE; if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */ number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); -- cgit v0.10.2 From 78db89287ce0f146a1f2a019a0b243ea4557caac Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 13 Sep 2010 13:27:10 +0200 Subject: drbd: DIV_ROUND_UP not needed here Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index daa672f..83ba63a 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -915,7 +915,7 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent { if (drbd_ee_has_active_page(e)) { /* This might happen if sendpage() has not finished */ - int i = DIV_ROUND_UP(e->size, PAGE_SIZE); + int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT; atomic_add(i, &mdev->pp_in_use_by_net); atomic_sub(i, &mdev->pp_in_use); spin_lock_irq(&mdev->req_lock); -- cgit v0.10.2 From fb22c402ffdf61dd121795b5809de587185d5240 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 8 Sep 2010 23:20:21 +0200 Subject: drbd: Track the reasons to suspend IO in dedicated state bits There are three ways to get IO suspended: * Loss of any access to data * Fence-peer-handler running * User requested to suspend IO Track those in different bits, so that one condition clearing its state bit does not interfere with the other two conditions. Only when the user resumes IO he overrules all three bits. The fact is hidden from the user, he sees only a single suspend bit. 
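The scheme is a classic "one visible flag, several internal reasons" split. A minimal sketch using an invented bitmask instead of DRBD's packed state union; the reason names are made up, while the rules (any single reason keeps IO frozen, only a user resume clears them all) are the ones described above.

#include <stdbool.h>

#define SUSP_USER    (1u << 0)  /* user requested to suspend IO */
#define SUSP_NO_DATA (1u << 1)  /* lost access to up-to-date data */
#define SUSP_FENCING (1u << 2)  /* fence-peer handler still running */

static unsigned int susp_reasons;

static bool io_suspended(void)          /* the single bit the user gets to see */
{
        return susp_reasons != 0;       /* any one reason keeps IO frozen */
}

static void clear_reason(unsigned int reason)
{
        susp_reasons &= ~reason;        /* clearing one reason cannot thaw IO
                                         * that another reason still holds */
}

static void user_resume_io(void)
{
        susp_reasons = 0;               /* only the user overrules all of them */
}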
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index eb1273d..ff7fffa 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1681,6 +1681,8 @@ void drbd_bcast_ee(struct drbd_conf *mdev, #define susp_MASK 1 #define user_isp_MASK 1 #define aftr_isp_MASK 1 +#define susp_nod_MASK 1 +#define susp_fen_MASK 1 #define NS(T, S) \ ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ @@ -2254,11 +2256,16 @@ static inline int drbd_state_is_stable(union drbd_state s) return 1; } +static inline int is_susp(union drbd_state s) +{ + return s.susp || s.susp_nod || s.susp_fen; +} + static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) { int mxb = drbd_get_max_buffers(mdev); - if (mdev->state.susp) + if (is_susp(mdev->state)) return 0; if (test_bit(SUSPEND_IO, &mdev->flags)) return 0; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 04c305d..4f33714 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -654,7 +654,7 @@ static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) drbd_role_str(ns.peer), drbd_disk_str(ns.disk), drbd_disk_str(ns.pdsk), - ns.susp ? 's' : 'r', + is_susp(ns) ? 's' : 'r', ns.aftr_isp ? 'a' : '-', ns.peer_isp ? 'p' : '-', ns.user_isp ? 'u' : '-' @@ -925,12 +925,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state if (fp == FP_STONITH && (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) - ns.susp = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ + ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO && (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) - ns.susp = 1; /* Suspend IO while no data available (no accessible data available) */ + ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { if (ns.conn == C_SYNC_SOURCE) @@ -1030,7 +1030,10 @@ int __drbd_set_state(struct drbd_conf *mdev, PSC(conn); PSC(disk); PSC(pdsk); - PSC(susp); + if (is_susp(ns) != is_susp(os)) + pbp += sprintf(pbp, "susp( %s -> %s ) ", + drbd_susp_str(is_susp(os)), + drbd_susp_str(is_susp(ns))); PSC(aftr_isp); PSC(peer_isp); PSC(user_isp); @@ -1218,6 +1221,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, { enum drbd_fencing_p fp; enum drbd_req_event what = nothing; + union drbd_state nsm = (union drbd_state){ .i = -1 }; if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { clear_bit(CRASHED_PRIMARY, &mdev->flags); @@ -1241,19 +1245,21 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Here we have the actions that are performed after a state change. 
This function might sleep */ - if (os.susp && ns.susp && mdev->sync_conf.on_no_data == OND_SUSPEND_IO) { + nsm.i = -1; + if (ns.susp_nod) { if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { if (ns.conn == C_CONNECTED) - what = resend; + what = resend, nsm.susp_nod = 0; else /* ns.conn > C_CONNECTED */ dev_err(DEV, "Unexpected Resynd going on!\n"); } if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) - what = restart_frozen_disk_io; + what = restart_frozen_disk_io, nsm.susp_nod = 0; + } - if (fp == FP_STONITH && ns.susp) { + if (ns.susp_fen) { /* case1: The outdate peer handler is successful: */ if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { tl_clear(mdev); @@ -1263,20 +1269,22 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, drbd_md_sync(mdev); } spin_lock_irq(&mdev->req_lock); - _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); + _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); spin_unlock_irq(&mdev->req_lock); } /* case2: The connection was established again: */ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { clear_bit(NEW_CUR_UUID, &mdev->flags); what = resend; + nsm.susp_fen = 0; } } if (what != nothing) { spin_lock_irq(&mdev->req_lock); _tl_restart(mdev, what); - _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); + nsm.i &= mdev->state.i; + _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL); spin_unlock_irq(&mdev->req_lock); } @@ -1298,7 +1306,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, if (get_ldev(mdev)) { if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { - if (mdev->state.susp) { + if (is_susp(mdev->state)) { set_bit(NEW_CUR_UUID, &mdev->flags); } else { drbd_uuid_new_current(mdev); @@ -1417,7 +1425,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, resume_next_sg(mdev); /* free tl_hash if we Got thawed and are C_STANDALONE */ - if (ns.conn == C_STANDALONE && ns.susp == 0 && mdev->tl_hash) + if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash) drbd_free_tl_hash(mdev); /* Upon network connection, we need to start the receiver */ @@ -2732,7 +2740,9 @@ static void drbd_set_defaults(struct drbd_conf *mdev) .conn = C_STANDALONE, .disk = D_DISKLESS, .pdsk = D_UNKNOWN, - .susp = 0 + .susp = 0, + .susp_nod = 0, + .susp_fen = 0 } }; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 5b30f90..9ee4456 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -209,7 +209,8 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) put_ldev(mdev); } else { dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); - return mdev->state.pdsk; + nps = mdev->state.pdsk; + goto out; } r = drbd_khelper(mdev, "fence-peer"); @@ -256,6 +257,14 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) dev_info(DEV, "fence-peer helper returned %d (%s)\n", (r>>8) & 0xff, ex_to_string); + +out: + if (mdev->state.susp_fen && nps >= D_UNKNOWN) { + /* The handler was not successful... unfreeze here, the + state engine can not unfreeze... 
*/ + _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE); + } + return nps; } @@ -550,7 +559,7 @@ char *ppsize(char *buf, unsigned long long size) void drbd_suspend_io(struct drbd_conf *mdev) { set_bit(SUSPEND_IO, &mdev->flags); - if (mdev->state.susp) + if (is_susp(mdev->state)) return; wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); } @@ -1016,7 +1025,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp drbd_suspend_io(mdev); /* also wait for the last barrier ack. */ - wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || mdev->state.susp); + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state)); /* and for any other previously queued work */ drbd_flush_workqueue(mdev); @@ -1114,8 +1123,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp clear_bit(CRASHED_PRIMARY, &mdev->flags); if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && - !(mdev->state.role == R_PRIMARY && mdev->state.susp && - mdev->sync_conf.on_no_data == OND_SUSPEND_IO)) { + !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) { set_bit(CRASHED_PRIMARY, &mdev->flags); cp_discovered = 1; } @@ -1939,7 +1947,7 @@ static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp drbd_md_sync(mdev); } drbd_suspend_io(mdev); - reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); + reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); if (reply->ret_code == SS_SUCCESS) { if (mdev->state.conn < C_CONNECTED) tl_clear(mdev); diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index a4a4a06..aec8426 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -213,7 +213,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v) drbd_disk_str(mdev->state.pdsk), (mdev->net_conf == NULL ? ' ' : (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), - mdev->state.susp ? 's' : 'r', + is_susp(mdev->state) ? 's' : 'r', mdev->state.aftr_isp ? 'a' : '-', mdev->state.peer_isp ? 'p' : '-', mdev->state.user_isp ? 'u' : '-', diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 983e49c..6b69b2f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3315,7 +3315,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) ns.disk = mdev->new_state_tmp.disk; cs_flags = CS_VERBOSE + (oconn < C_CONNECTED && nconn >= C_CONNECTED ? 0 : CS_HARD); - if (ns.pdsk == D_CONSISTENT && ns.susp && nconn == C_CONNECTED && oconn < C_CONNECTED && + if (ns.pdsk == D_CONSISTENT && is_susp(ns) && nconn == C_CONNECTED && oconn < C_CONNECTED && test_bit(NEW_CUR_UUID, &mdev->flags)) { /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this for temporal network outages! 
*/ @@ -3829,7 +3829,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) kfree(mdev->p_uuid); mdev->p_uuid = NULL; - if (!mdev->state.susp) + if (!is_susp(mdev->state)) tl_clear(mdev); dev_info(DEV, "Connection closed\n"); @@ -3858,7 +3858,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) if (os.conn == C_DISCONNECTING) { wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0); - if (!mdev->state.susp) { + if (!is_susp(mdev->state)) { /* we must not free the tl_hash * while application io is still on the fly */ wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index af608b3..9e91a25 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -287,7 +287,7 @@ static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_e { struct drbd_conf *mdev = req->mdev; - if (!mdev->state.susp) + if (!is_susp(mdev->state)) _req_may_be_done(req, m); } @@ -812,7 +812,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) (mdev->state.pdsk == D_INCONSISTENT && mdev->state.conn >= C_CONNECTED)); - if (!(local || remote) && !mdev->state.susp) { + if (!(local || remote) && !is_susp(mdev->state)) { dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); goto fail_free_complete; } @@ -838,7 +838,7 @@ allocate_barrier: /* GOOD, everything prepared, grab the spin_lock */ spin_lock_irq(&mdev->req_lock); - if (mdev->state.susp) { + if (is_susp(mdev->state)) { /* If we got suspended, use the retry mechanism of generic_make_request() to restart processing of this bio. In the next call to drbd_make_request_26 diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 89718a3..5e72a5d 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -232,13 +232,17 @@ union drbd_state { unsigned conn:5 ; /* 17/32 cstates */ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ - unsigned susp:1 ; /* 2/2 IO suspended no/yes */ + unsigned susp:1 ; /* 2/2 IO suspended no/yes (by user) */ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ unsigned peer_isp:1 ; unsigned user_isp:1 ; - unsigned _pad:11; /* 0 unused */ + unsigned susp_nod:1 ; /* IO suspended because no data */ + unsigned susp_fen:1 ; /* IO suspended because fence peer handler runs*/ + unsigned _pad:9; /* 0 unused */ #elif defined(__BIG_ENDIAN_BITFIELD) - unsigned _pad:11; /* 0 unused */ + unsigned _pad:9; + unsigned susp_fen:1 ; + unsigned susp_nod:1 ; unsigned user_isp:1 ; unsigned peer_isp:1 ; unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ -- cgit v0.10.2 From 8979d9c9e0bc8e54cf5bd7a89abb2145f087b5e1 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 14 Sep 2010 15:56:29 +0200 Subject: drbd: protocol compatibility for maximum packet sizes Two missing corner cases to the "maximum packet size" handshake. 
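The hunks below gate the maximum segment size on the agreed protocol version. A minimal standalone sketch of the attach-side decision made while connected follows; the numeric values are placeholders, not the real DRBD constants, and the helper name is invented for illustration:

    #include <stdint.h>

    /* Placeholder limits; the real constants live in the drbd headers. */
    #define SKETCH_MAX_SEGMENT_SIZE   (1U << 17)  /* assumed 8.3.8+ default       */
    #define SKETCH_MAX_SIZE_H80_PACKET (1U << 15) /* assumed old p_header80 limit */

    /* Pick a maximum segment (bio) size that a peer speaking
     * 'agreed_pro_version' can handle; 'current_max' is what the local
     * queue is already limited to while we are connected. */
    static uint32_t pick_max_seg_size(int agreed_pro_version, uint32_t current_max)
    {
        if (agreed_pro_version < 94)
            return current_max;                  /* do not grow past what the peer saw  */
        if (agreed_pro_version == 94)
            return SKETCH_MAX_SIZE_H80_PACKET;   /* limited by the old header format    */
        return SKETCH_MAX_SEGMENT_SIZE;          /* drbd 8.3.9 and later: full default  */
    }

receive_sizes() applies the same pre-94 / 94 / 95+ split, with the pre-94 branch taking the size the peer announced instead of the local queue limit.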
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 9ee4456..9ae33a5 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -861,6 +861,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp struct inode *inode, *inode2; struct lru_cache *resync_lru = NULL; union drbd_state ns, os; + unsigned int max_seg_s; int rv; int cp_discovered = 0; int logical_block_size; @@ -1133,9 +1134,20 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp mdev->read_cnt = 0; mdev->writ_cnt = 0; - drbd_setup_queue_param(mdev, mdev->state.conn == C_CONNECTED && - mdev->agreed_pro_version < 95 ? - DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE); + max_seg_s = DRBD_MAX_SEGMENT_SIZE; + if (mdev->state.conn == C_CONNECTED) { + /* We are Primary, Connected, and now attach a new local + * backing store. We must not increase the user visible maximum + * bio size on this device to something the peer may not be + * able to handle. */ + if (mdev->agreed_pro_version < 94) + max_seg_s = queue_max_segment_size(mdev->rq_queue); + else if (mdev->agreed_pro_version == 94) + max_seg_s = DRBD_MAX_SIZE_H80_PACKET; + /* else: drbd 8.3.9 and later, stay with default */ + } + + drbd_setup_queue_param(mdev, max_seg_s); /* If I am currently not R_PRIMARY, * but meta data primary indicator is set, diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 6b69b2f..9da32ac 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3088,6 +3088,8 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned if (mdev->agreed_pro_version < 94) max_seg_s = be32_to_cpu(p->max_segment_size); + else if (mdev->agreed_pro_version == 94) + max_seg_s = DRBD_MAX_SIZE_H80_PACKET; else /* drbd 8.3.8 onwards */ max_seg_s = DRBD_MAX_SEGMENT_SIZE; -- cgit v0.10.2 From f65363cfa05fe60874030461a0eeb84b7e60cba4 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 14 Sep 2010 20:14:09 +0200 Subject: drbd: fix possible access after free If we release the page pointed to by md_io_tmpp, we need to zero out the pointer, too, as that may be used later to decide whether we need to allocate a new page again. Impact: a previously freed page may be used and clobbered. Depending on what that particular page is being used for meanwhile, this may result in silent data corruption of completely unrelated things. Only of concern on devices with logical_block_size != 512 byte, if you re-attach after becoming diskless once. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 4f33714..e1f2c2e 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1407,8 +1407,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, drbd_free_bc(mdev->ldev); mdev->ldev = NULL;); - if (mdev->md_io_tmpp) + if (mdev->md_io_tmpp) { __free_page(mdev->md_io_tmpp); + mdev->md_io_tmpp = NULL; + } } /* Disks got bigger while they were detached */ -- cgit v0.10.2 From 00b425377d60e67e86721d4ce6d7cbf131a5d0fd Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 5 Oct 2010 11:19:39 +0200 Subject: drbd: Allow larger values for c-fill-target. Connections through a compressing proxy might have more bits on the fly. 
500MByte instead of 50MByte Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index ff7fffa..1680939 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -348,7 +348,7 @@ struct p_header80 { struct p_header95 { u16 magic; /* use DRBD_MAGIC_BIG here */ u16 command; - u32 length; + u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */ u8 payload[0]; } __packed; diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 0b24ded..4ac33f3 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -143,7 +143,7 @@ #define DRBD_C_DELAY_TARGET_DEF 10 #define DRBD_C_FILL_TARGET_MIN 0 -#define DRBD_C_FILL_TARGET_MAX 100000 +#define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */ #define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */ #define DRBD_C_MAX_RATE_MIN 250 /* kByte/sec */ -- cgit v0.10.2 From f10f262349762c96ab247b6108af3a30b52b6f5a Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 5 Oct 2010 16:50:17 +0200 Subject: drbd: Fixed a stupid copy and paste error This caused rs_planed to be not in sync with the content of the fifo. That in turn could cause that the resync comes to a complete halt. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 83ba63a..166b51e 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -421,7 +421,7 @@ static void fifo_set(struct fifo_buffer *fb, int value) int i; for (i = 0; i < fb->size; i++) - fb->values[i] += value; + fb->values[i] = value; } static int fifo_push(struct fifo_buffer *fb, int value) -- cgit v0.10.2 From 004352fa60345e499379af310de73a2df1a5762a Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 5 Oct 2010 20:13:58 +0200 Subject: drbd: Fix regression in recv_bm_rle_bits (compressed bitmap) We used to be16_to_cpu the length field in our received packet header. drbd commit 17c854fea474a5eb3cfa12e4fb019e46debbc4ec drbd: receiving of big packets, for payloads between 64kByte and 4GByte changed this, but forgot to adjust a few places where we relied on h->length being in native byte order. This broke the receiving side of the RLE compressed bitmap exchange. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 9da32ac..b5d3fa6 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -970,9 +970,10 @@ static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsi *cmd = be16_to_cpu(h->h95.command); *packet_size = be32_to_cpu(h->h95.length); } else { - dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n", - (long)be32_to_cpu(h->h80.magic), - h->h80.command, h->h80.length); + dev_err(DEV, "magic?? 
on data m: 0x%08x c: %d l: %d\n", + be32_to_cpu(h->h80.magic), + be16_to_cpu(h->h80.command), + be16_to_cpu(h->h80.length)); return FALSE; } mdev->last_received = jiffies; @@ -3421,7 +3422,7 @@ recv_bm_rle_bits(struct drbd_conf *mdev, u64 tmp; unsigned long s = c->bit_offset; unsigned long e; - int len = p->head.length - (sizeof(*p) - sizeof(p->head)); + int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head)); int toggle = DCBP_get_start(p); int have; int bits; @@ -3570,8 +3571,8 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne memcpy(p, h, sizeof(*h)); if (drbd_recv(mdev, p->head.payload, data_size) != data_size) goto out; - if (p->head.length <= (sizeof(*p) - sizeof(p->head))) { - dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length); + if (data_size <= (sizeof(*p) - sizeof(p->head))) { + dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size); return FAILED; } ret = decode_bitmap_c(mdev, p, &c); @@ -4582,17 +4583,19 @@ int drbd_asender(struct drbd_thread *thi) if (received == expect && cmd == NULL) { if (unlikely(h->magic != BE_DRBD_MAGIC)) { - dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n", - (long)be32_to_cpu(h->magic), - h->command, h->length); + dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n", + be32_to_cpu(h->magic), + be16_to_cpu(h->command), + be16_to_cpu(h->length)); goto reconnect; } cmd = get_asender_cmd(be16_to_cpu(h->command)); len = be16_to_cpu(h->length); if (unlikely(cmd == NULL)) { - dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n", - (long)be32_to_cpu(h->magic), - h->command, h->length); + dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n", + be32_to_cpu(h->magic), + be16_to_cpu(h->command), + be16_to_cpu(h->length)); goto disconnect; } expect = cmd->pkt_size; -- cgit v0.10.2 From 2b2bf2148fd46874ee72a877c951e5c6675d1caa Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 6 Oct 2010 11:46:55 +0200 Subject: drbd: drbd_send_ack_dp must not rely on header information drbd commit 17c854fea474a5eb3cfa12e4fb019e46debbc4ec drbd: receiving of big packets, for payloads between 64kByte and 4GByte introduced a new on-the-wire packet header format. We must no longer assume either format, but use the result of whatever drbd_recv_header has decoded. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 1680939..8ab6fed 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1230,7 +1230,7 @@ extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, struct p_block_req *rp); extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, - struct p_data *dp); + struct p_data *dp, int data_size); extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, sector_t sector, int blksize, u64 block_id); extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index e1f2c2e..accb37d 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2193,13 +2193,14 @@ static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, return ok; } +/* dp->sector and dp->block_id already/still in network byte order, + * data_size is payload size according to dp->head, + * and may need to be corrected for digest size. 
*/ int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, - struct p_data *dp) + struct p_data *dp, int data_size) { - const int header_size = sizeof(struct p_data) - - sizeof(struct p_header80); - int data_size = ((struct p_header80 *)dp)->length - header_size; - + data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? + crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), dp->block_id); } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index b5d3fa6..45a2d61 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1630,7 +1630,7 @@ static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, un ok = drbd_drain_block(mdev, data_size); - drbd_send_ack_dp(mdev, P_NEG_ACK, p); + drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size); } atomic_add(data_size >> 9, &mdev->rs_sect_in); @@ -1787,7 +1787,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned mdev->peer_seq++; spin_unlock(&mdev->peer_seq_lock); - drbd_send_ack_dp(mdev, P_NEG_ACK, p); + drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size); atomic_inc(&mdev->current_epoch->epoch_size); return drbd_drain_block(mdev, data_size); } -- cgit v0.10.2 From 4ac4aadacb5badc45679cd94cd362132daafe8c4 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 22 Jul 2010 17:39:26 +0200 Subject: drbd: preparation commit, using full state in receive_state() no functional change, just using full state instead of just the .conn part of it for comparisons. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 45a2d61..585049d 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3242,8 +3242,7 @@ static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsi static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { struct p_state *p = &mdev->data.rbuf.state; - enum drbd_conns nconn, oconn; - union drbd_state ns, peer_state; + union drbd_state os, ns, peer_state; enum drbd_disk_state real_peer_disk; enum chg_state_flags cs_flags; int rv; @@ -3258,38 +3257,38 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned spin_lock_irq(&mdev->req_lock); retry: - oconn = nconn = mdev->state.conn; + os = ns = mdev->state; spin_unlock_irq(&mdev->req_lock); - if (nconn == C_WF_REPORT_PARAMS) - nconn = C_CONNECTED; + if (ns.conn == C_WF_REPORT_PARAMS) + ns.conn = C_CONNECTED; if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING && get_ldev_if_state(mdev, D_NEGOTIATING)) { int cr; /* consider resync */ /* if we established a new connection */ - cr = (oconn < C_CONNECTED); + cr = (os.conn < C_CONNECTED); /* if we had an established connection * and one of the nodes newly attaches a disk */ - cr |= (oconn == C_CONNECTED && + cr |= (os.conn == C_CONNECTED && (peer_state.disk == D_NEGOTIATING || - mdev->state.disk == D_NEGOTIATING)); + os.disk == D_NEGOTIATING)); /* if we have both been inconsistent, and the peer has been * forced to be UpToDate with --overwrite-data */ cr |= test_bit(CONSIDER_RESYNC, &mdev->flags); /* if we had been plain connected, and the admin requested to * start a sync by "invalidate" or "invalidate-remote" */ - cr |= (oconn == C_CONNECTED && + cr |= (os.conn == C_CONNECTED && (peer_state.conn >= C_STARTING_SYNC_S && 
peer_state.conn <= C_WF_BITMAP_T)); if (cr) - nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); + ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); put_ldev(mdev); - if (nconn == C_MASK) { - nconn = C_CONNECTED; + if (ns.conn == C_MASK) { + ns.conn = C_CONNECTED; if (mdev->state.disk == D_NEGOTIATING) { drbd_force_state(mdev, NS(disk, D_DISKLESS)); } else if (peer_state.disk == D_NEGOTIATING) { @@ -3299,7 +3298,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned } else { if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags)) return FALSE; - D_ASSERT(oconn == C_WF_REPORT_PARAMS); + D_ASSERT(os.conn == C_WF_REPORT_PARAMS); drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); return FALSE; } @@ -3307,18 +3306,16 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned } spin_lock_irq(&mdev->req_lock); - if (mdev->state.conn != oconn) + if (mdev->state.i != os.i) goto retry; clear_bit(CONSIDER_RESYNC, &mdev->flags); - ns.i = mdev->state.i; - ns.conn = nconn; ns.peer = peer_state.role; ns.pdsk = real_peer_disk; ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); - if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) + if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) ns.disk = mdev->new_state_tmp.disk; - cs_flags = CS_VERBOSE + (oconn < C_CONNECTED && nconn >= C_CONNECTED ? 0 : CS_HARD); - if (ns.pdsk == D_CONSISTENT && is_susp(ns) && nconn == C_CONNECTED && oconn < C_CONNECTED && + cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); + if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && test_bit(NEW_CUR_UUID, &mdev->flags)) { /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this for temporal network outages! */ @@ -3339,8 +3336,8 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned return FALSE; } - if (oconn > C_WF_REPORT_PARAMS) { - if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED && + if (os.conn > C_WF_REPORT_PARAMS) { + if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED && peer_state.disk != D_NEGOTIATING ) { /* we want resync, peer has not yet decided to sync... */ /* Nowadays only used when forcing a node into primary role and -- cgit v0.10.2 From e9ef7bb6f9696471ddddf0065afac8b435e5d051 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 7 Oct 2010 15:55:39 +0200 Subject: drbd: allow for explicit resync-finished notifications Preparation patch so more drbd_send_state() usage on the peer will not confuse drbd in receive_state(). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 585049d..990fe01 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3260,6 +3260,40 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned os = ns = mdev->state; spin_unlock_irq(&mdev->req_lock); + /* peer says his disk is uptodate, while we think it is inconsistent, + * and this happens while we think we have a sync going on. 
*/ + if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE && + os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { + /* If we are (becoming) SyncSource, but peer is still in sync + * preparation, ignore its uptodate-ness to avoid flapping, it + * will change to inconsistent once the peer reaches active + * syncing states. + * It may have changed syncer-paused flags, however, so we + * cannot ignore this completely. */ + if (peer_state.conn > C_CONNECTED && + peer_state.conn < C_SYNC_SOURCE) + real_peer_disk = D_INCONSISTENT; + + /* if peer_state changes to connected at the same time, + * it explicitly notifies us that it finished resync. + * Maybe we should finish it up, too? */ + else if (os.conn >= C_SYNC_SOURCE && + peer_state.conn == C_CONNECTED) { + if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) + drbd_resync_finished(mdev); + return TRUE; + } + } + + /* peer says his disk is inconsistent, while we think it is uptodate, + * and this happens while the peer still thinks we have a sync going on, + * but we think we are already done with the sync. + * We ignore this to avoid flapping pdsk. + * This should not happen, if the peer is a recent version of drbd. */ + if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT && + os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE) + real_peer_disk = D_UP_TO_DATE; + if (ns.conn == C_WF_REPORT_PARAMS) ns.conn = C_CONNECTED; -- cgit v0.10.2 From af85e8e83d160f72a10e4467852646ac08614260 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 7 Oct 2010 16:07:55 +0200 Subject: drbd: fix for spurious fullsync (uuids rotated too fast) If it was an "empty" resync, the SyncSource may have already "finished" the resync and rotated the UUIDs, before noticing the connection loss (and generating a new uuid, if Primary, rotating again), while the SyncTarget did not change its uuids at all, or only got to the previous sync-uuid. This would then again lead to a full sync on next handshake (see also Bug #251). Fix: Use explicit resync finished notification even for empty resyncs, do not finish an empty resync implicitly on the SyncSource. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index accb37d..63f45d7 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1426,6 +1426,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, (os.user_isp && !ns.user_isp)) resume_next_sg(mdev); + /* sync target done with resync. Explicitly notify peer, even though + * it should (at least for non-empty resyncs) already know itself. */ + if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) + drbd_send_state(mdev); + /* free tl_hash if we Got thawed and are C_STANDALONE */ if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash) drbd_free_tl_hash(mdev); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 166b51e..88be45a 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -522,6 +522,12 @@ int w_make_resync_request(struct drbd_conf *mdev, dev_err(DEV, "%s in w_make_resync_request\n", drbd_conn_str(mdev->state.conn)); + if (mdev->rs_total == 0) { + /* empty resync? 
*/ + drbd_resync_finished(mdev); + return 1; + } + if (!get_ldev(mdev)) { /* Since we only need to access mdev->rsync a get_ldev_if_state(mdev,D_FAILED) would be sufficient, but @@ -768,6 +774,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca return 1; } +static void ping_peer(struct drbd_conf *mdev) +{ + clear_bit(GOT_PING_ACK, &mdev->flags); + request_ping(mdev); + wait_event(mdev->misc_wait, + test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED); +} + int drbd_resync_finished(struct drbd_conf *mdev) { unsigned long db, dt, dbdt; @@ -807,6 +821,8 @@ int drbd_resync_finished(struct drbd_conf *mdev) if (!get_ldev(mdev)) goto out; + ping_peer(mdev); + spin_lock_irq(&mdev->req_lock); os = mdev->state; @@ -1420,14 +1436,6 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na) return retcode; } -static void ping_peer(struct drbd_conf *mdev) -{ - clear_bit(GOT_PING_ACK, &mdev->flags); - request_ping(mdev); - wait_event(mdev->misc_wait, - test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED); -} - /** * drbd_start_resync() - Start the resync process * @mdev: DRBD device. @@ -1527,9 +1535,21 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), (unsigned long) mdev->rs_total); - if (mdev->rs_total == 0) { - /* Peer still reachable? Beware of failing before-resync-target handlers! */ - ping_peer(mdev); + if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) { + /* This still has a race (about when exactly the peers + * detect connection loss) that can lead to a full sync + * on next handshake. In 8.3.9 we fixed this with explicit + * resync-finished notifications, but the fix + * introduces a protocol change. Sleeping for some + * time longer than the ping interval + timeout on the + * SyncSource, to give the SyncTarget the chance to + * detect connection loss, then waiting for a ping + * response (implicit in drbd_resync_finished) reduces + * the race considerably, but does not solve it. */ + if (side == C_SYNC_SOURCE) + schedule_timeout_interruptible( + mdev->net_conf->ping_int * HZ + + mdev->net_conf->ping_timeo*HZ/9); drbd_resync_finished(mdev); } -- cgit v0.10.2 From 22cc37a943832c948808884604ec6f5ff2594c1d Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 14 Sep 2010 20:40:41 +0200 Subject: drbd: fix unlikely access after free and list corruption Various cleanup paths have been incomplete, for the very unlikely case that we cannot allocate enough bios from process context when submitting on behalf of the peer or resync process. Never observed. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 990fe01..71775a9 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1573,6 +1573,13 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) return TRUE; + /* drbd_submit_ee currently fails for one reason only: + * not being able to allocate enough bios. + * Is dropping the connection going to help? 
*/ + spin_lock_irq(&mdev->req_lock); + list_del(&e->w.list); + spin_unlock_irq(&mdev->req_lock); + drbd_free_ee(mdev, e); fail: put_ldev(mdev); @@ -1998,6 +2005,16 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) return TRUE; + /* drbd_submit_ee currently fails for one reason only: + * not being able to allocate enough bios. + * Is dropping the connection going to help? */ + spin_lock_irq(&mdev->req_lock); + list_del(&e->w.list); + hlist_del_init(&e->colision); + spin_unlock_irq(&mdev->req_lock); + if (e->flags & EE_CALL_AL_COMPLETE_IO) + drbd_al_complete_io(mdev, e->sector); + out_interrupted: /* yes, the epoch_size now is imbalanced. * but we drop the connection anyways, so we don't have a chance to @@ -2202,6 +2219,14 @@ submit: if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) return TRUE; + /* drbd_submit_ee currently fails for one reason only: + * not being able to allocate enough bios. + * Is dropping the connection going to help? */ + spin_lock_irq(&mdev->req_lock); + list_del(&e->w.list); + spin_unlock_irq(&mdev->req_lock); + /* no drbd_rs_complete_io(), we are dropping the connection anyways */ + out_free_e: put_ldev(mdev); drbd_free_ee(mdev, e); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 88be45a..f12822d 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -387,6 +387,13 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) return 0; + /* drbd_submit_ee currently fails for one reason only: + * not being able to allocate enough bios. + * Is dropping the connection going to help? */ + spin_lock_irq(&mdev->req_lock); + list_del(&e->w.list); + spin_unlock_irq(&mdev->req_lock); + drbd_free_ee(mdev, e); defer: put_ldev(mdev); diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 5e72a5d..da7d9bd 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,10 +53,10 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.8.1" +#define REL_VERSION "8.3.9rc1" #define API_VERSION 88 #define PRO_VERSION_MIN 86 -#define PRO_VERSION_MAX 94 +#define PRO_VERSION_MAX 95 enum drbd_io_error_p { -- cgit v0.10.2 From e9e6f3ec535d7b7c9e2ca64ad691e743e7d3c2f0 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 14 Sep 2010 20:26:27 +0200 Subject: drbd: fix for possible deadlock on IO error during resync Scenario: Something (say, flush-147:0) is in drbd_al_begin_io, holding a local_cnt, waiting for the resync to make progress. Disk fails, worker in after_state_ch does drbd_rs_cancel_all, then waits for local_cnt to drop to zero. flush-147:0 is woken by drbd_rs_cancel_all, needs to write an AL transaction, and queues that on the worker. Deadlock. Fix: do not wait in the worker, have put_ldev() trigger the state change D_FAILED -> D_DISKLESS when necessary. put_ldev() cannot do the state change directly, as it may or may not already hold various spinlocks. We queue a short work instead. 
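The fix boils down to a "last reference drop defers the heavy work" pattern: the final put may run in atomic context, so it only marks the device and queues work rather than changing state itself. A self-contained userspace sketch of that pattern, with invented names and a hypothetical deferred-work hook standing in for drbd_queue_work():

    #include <stdatomic.h>
    #include <stdbool.h>

    struct dev_ctx {
        atomic_int  local_cnt;                 /* references to the local backing device */
        atomic_bool disk_failed;               /* set when the disk enters D_FAILED      */
        atomic_flag go_diskless_queued;        /* ensures the work is queued only once   */
        void (*queue_work)(struct dev_ctx *);  /* hypothetical deferred-work hook        */
    };

    /* Drop one reference.  If it was the last one and the disk has failed,
     * queue the D_FAILED -> D_DISKLESS transition instead of performing it
     * here, because the caller may hold spinlocks or run in atomic context. */
    static void put_local_ref(struct dev_ctx *d)
    {
        if (atomic_fetch_sub(&d->local_cnt, 1) == 1 &&
            atomic_load(&d->disk_failed) &&
            !atomic_flag_test_and_set(&d->go_diskless_queued))
            d->queue_work(d);   /* the worker performs the actual state change */
    }

The GO_DISKLESS bit in the patch plays the role of go_diskless_queued here.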
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 8ab6fed..c07c370 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -852,6 +852,7 @@ enum { BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ + GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ NET_CONGESTED, /* The data socket is congested */ @@ -976,6 +977,7 @@ struct drbd_conf { unsigned int ko_count; struct drbd_work resync_work, unplug_work, + go_diskless, md_sync_work; struct timer_list resync_timer; struct timer_list md_sync_timer; @@ -1278,6 +1280,7 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); +extern void drbd_go_diskless(struct drbd_conf *mdev); /* Meta data layout @@ -2123,8 +2126,11 @@ static inline void put_ldev(struct drbd_conf *mdev) int i = atomic_dec_return(&mdev->local_cnt); __release(local); D_ASSERT(i >= 0); - if (i == 0) + if (i == 0) { + if (mdev->state.disk == D_FAILED) + drbd_go_diskless(mdev); wake_up(&mdev->misc_wait); + } } #ifndef __CHECKER__ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 63f45d7..f89b974 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -77,6 +77,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); static void md_sync_timer_fn(unsigned long data); static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); MODULE_AUTHOR("Philipp Reisner , " "Lars Ellenberg "); @@ -1363,42 +1364,46 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); + /* first half of local IO error */ if (os.disk > D_FAILED && ns.disk == D_FAILED) { - enum drbd_io_error_p eh; + enum drbd_io_error_p eh = EP_PASS_ON; + + if (drbd_send_state(mdev)) + dev_warn(DEV, "Notified peer that my disk is broken.\n"); + else + dev_err(DEV, "Sending state for drbd_io_error() failed\n"); + + drbd_rs_cancel_all(mdev); - eh = EP_PASS_ON; if (get_ldev_if_state(mdev, D_FAILED)) { eh = mdev->ldev->dc.on_io_error; put_ldev(mdev); } + if (eh == EP_CALL_HELPER) + drbd_khelper(mdev, "local-io-error"); + } - drbd_rs_cancel_all(mdev); - /* since get_ldev() only works as long as disk>=D_INCONSISTENT, - and it is D_DISKLESS here, local_cnt can only go down, it can - not increase... 
It will reach zero */ - wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + + /* second half of local IO error handling, + * after local_cnt references have reached zero: */ + if (os.disk == D_FAILED && ns.disk == D_DISKLESS) { mdev->rs_total = 0; mdev->rs_failed = 0; atomic_set(&mdev->rs_pending_cnt, 0); - - spin_lock_irq(&mdev->req_lock); - _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL); - spin_unlock_irq(&mdev->req_lock); - - if (eh == EP_CALL_HELPER) - drbd_khelper(mdev, "local-io-error"); } if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { + int c = atomic_read(&mdev->local_cnt); - if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ { - if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that my disk is broken.\n"); - else - dev_err(DEV, "Sending state in drbd_io_error() failed\n"); - } + if (drbd_send_state(mdev)) + dev_warn(DEV, "Notified peer that I detached my disk.\n"); + else + dev_err(DEV, "Sending state for detach failed\n"); - wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + if (c != 0) { + dev_err(DEV, "Logic bug, local_cnt=%d, but should be 0\n", c); + wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + } lc_destroy(mdev->resync); mdev->resync = NULL; lc_destroy(mdev->act_log); @@ -2803,11 +2808,13 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) INIT_LIST_HEAD(&mdev->meta.work.q); INIT_LIST_HEAD(&mdev->resync_work.list); INIT_LIST_HEAD(&mdev->unplug_work.list); + INIT_LIST_HEAD(&mdev->go_diskless.list); INIT_LIST_HEAD(&mdev->md_sync_work.list); INIT_LIST_HEAD(&mdev->bm_io_work.w.list); mdev->resync_work.cb = w_resync_inactive; mdev->unplug_work.cb = w_send_write_hint; + mdev->go_diskless.cb = w_go_diskless; mdev->md_sync_work.cb = w_md_sync; mdev->bm_io_work.w.cb = w_bitmap_io; init_timer(&mdev->resync_timer); @@ -2885,6 +2892,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) D_ASSERT(list_empty(&mdev->meta.work.q)); D_ASSERT(list_empty(&mdev->resync_work.list)); D_ASSERT(list_empty(&mdev->unplug_work.list)); + D_ASSERT(list_empty(&mdev->go_diskless.list)); } @@ -3712,6 +3720,24 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) return 1; } +static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + D_ASSERT(mdev->state.disk == D_FAILED); + D_ASSERT(atomic_read(&mdev->local_cnt) == 0); + + drbd_force_state(mdev, NS(disk, D_DISKLESS)); + + clear_bit(GO_DISKLESS, &mdev->flags); + return 1; +} + +void drbd_go_diskless(struct drbd_conf *mdev) +{ + D_ASSERT(mdev->state.disk == D_FAILED); + if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) + drbd_queue_work_front(&mdev->data.work, &mdev->go_diskless); +} + /** * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap * @mdev: DRBD device. -- cgit v0.10.2 From b18b37befb37810ce50e1a9b0a6206dfe363d827 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 13 Oct 2010 15:32:44 +0200 Subject: drbd: Do not log an ASSERT for P_OV_REQUEST packets while C_CONNECTED This might happen if on the VERIFY_S node the disk gets dropped. Although this is an cluster wide state transition, the VERIFY_T node, updates it connection state first. Then the ack packet for the cluster wide state transition travels back, and the VERIFY_S node stops to produce the P_OV_REQUEST packets. There is absolutely nothing wrong with that. Further, do not log "Can not satisfy peer's..." on the VERIFY_S node in this case, but pretend that they had equal checksum. 
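The behavioural change amounts to answering each read-type request with the matching negative (or in-sync) reply instead of asserting. A reduced sketch of that mapping, using a simplified enum in place of the real drbd packet codes (only the cases touched by this patch are shown):

    enum req_pkt   { DATA_REQUEST, RS_DATA_REQUEST, CSUM_RS_REQUEST, OV_REQUEST, OV_REPLY };
    enum reply_pkt { NEG_DREPLY, NEG_RS_DREPLY, OV_RESULT_IN_SYNC };

    /* Reply to choose when a peer read request arrives but the local disk
     * is not D_UP_TO_DATE (e.g. it was just dropped on the VERIFY_S node). */
    static enum reply_pkt reply_without_local_data(enum req_pkt r)
    {
        switch (r) {
        case DATA_REQUEST:                 /* application read                      */
            return NEG_DREPLY;
        case RS_DATA_REQUEST:
        case CSUM_RS_REQUEST:
        case OV_REQUEST:                   /* resync / online-verify read           */
            return NEG_RS_DREPLY;
        case OV_REPLY:                     /* verify result: pretend equal checksum */
            return OV_RESULT_IN_SYNC;
        }
        return NEG_RS_DREPLY;              /* not reached for the packet types above */
    }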
[Bugz 327] Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 71775a9..4dc6a88 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2079,7 +2079,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un const sector_t capacity = drbd_get_capacity(mdev->this_bdev); struct drbd_epoch_entry *e; struct digest_info *di = NULL; - int size; + int size, verb; unsigned int fault_type; struct p_block_req *p = &mdev->data.rbuf.block_req; @@ -2098,11 +2098,29 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un } if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { - if (__ratelimit(&drbd_ratelimit_state)) + verb = 1; + switch (cmd) { + case P_DATA_REQUEST: + drbd_send_ack_rp(mdev, P_NEG_DREPLY, p); + break; + case P_RS_DATA_REQUEST: + case P_CSUM_RS_REQUEST: + case P_OV_REQUEST: + drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p); + break; + case P_OV_REPLY: + verb = 0; + dec_rs_pending(mdev); + drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC); + break; + default: + dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", + cmdname(cmd)); + } + if (verb && __ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Can not satisfy peer's read request, " "no local data.\n"); - drbd_send_ack_rp(mdev, cmd == P_DATA_REQUEST ? P_NEG_DREPLY : - P_NEG_RS_DREPLY , p); + /* drain possibly payload */ return drbd_drain_block(mdev, digest_size); } @@ -2157,10 +2175,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un break; case P_OV_REQUEST: - if (mdev->state.conn >= C_CONNECTED && - mdev->state.conn != C_VERIFY_T) - dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n", - drbd_conn_str(mdev->state.conn)); if (mdev->ov_start_sector == ~(sector_t)0 && mdev->agreed_pro_version >= 90) { mdev->ov_start_sector = sector; -- cgit v0.10.2 From 13d42685bec1f012dcbc5d187490eb1d15ec8219 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 13 Oct 2010 17:37:54 +0200 Subject: drbd: add explicit drbd_md_sync to drbd_resync_finished As we usually update the generation UUIDs here, we should explicitly sync them to disk. So far this has been done only implicitly by related code paths. 
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index f12822d..108d580 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -922,6 +922,8 @@ out: mdev->rs_paused = 0; mdev->ov_start_sector = 0; + drbd_md_sync(mdev); + if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); -- cgit v0.10.2 From 0f8488e1608b6e30e705460f8110888c645f7f9f Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 13 Oct 2010 18:19:23 +0200 Subject: drbd: cleanup useless leftover warn/error printk's Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 9ae33a5..87925e9 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -780,9 +780,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu blk_queue_segment_boundary(q, PAGE_SIZE-1); blk_stack_limits(&q->limits, &b->limits, 0); - if (b->merge_bvec_fn) - dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n", - b->merge_bvec_fn); dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q)); if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 4dc6a88..5a4b6dc 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -4591,10 +4591,8 @@ int drbd_asender(struct drbd_thread *thi) while (1) { clear_bit(SIGNAL_ASENDER, &mdev->flags); flush_signals(current); - if (!drbd_process_done_ee(mdev)) { - dev_err(DEV, "process_done_ee() = NOT_OK\n"); + if (!drbd_process_done_ee(mdev)) goto reconnect; - } /* to avoid race with newly queued ACKs */ set_bit(SIGNAL_ASENDER, &mdev->flags); spin_lock_irq(&mdev->req_lock); -- cgit v0.10.2 From 9d282875d85ebc2b49362310677fc0dcd91b9db9 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 14 Oct 2010 13:57:07 +0200 Subject: drbd: drop wrong debug asserts, fix recently introduced race commit 2372c38caadeaebc68a5ee190782c2a0df01edc3 drbd: fix for possible deadlock on IO error during resync introduced a new ASSERT, which turns out to be wrong. Drop it. Also serialize the state change to D_DISKLESS with the after state change work of the -> D_FAILED transition, don't open a new race. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f89b974..342574f 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1393,17 +1393,22 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, } if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { - int c = atomic_read(&mdev->local_cnt); - + /* We must still be diskless, + * re-attach has to be serialized with this! */ + if (mdev->state.disk != D_DISKLESS) + dev_err(DEV, + "ASSERT FAILED: disk is %s while going diskless\n", + drbd_disk_str(mdev->state.disk)); + + /* we cannot assert local_cnt == 0 here, as get_ldev_if_state + * will inc/dec it frequently. Since we became D_DISKLESS, no + * one has touched the protected members anymore, though, so we + * are safe to free them here. 
*/ if (drbd_send_state(mdev)) dev_warn(DEV, "Notified peer that I detached my disk.\n"); else dev_err(DEV, "Sending state for detach failed\n"); - if (c != 0) { - dev_err(DEV, "Logic bug, local_cnt=%d, but should be 0\n", c); - wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); - } lc_destroy(mdev->resync); mdev->resync = NULL; lc_destroy(mdev->act_log); @@ -3723,8 +3728,10 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) { D_ASSERT(mdev->state.disk == D_FAILED); - D_ASSERT(atomic_read(&mdev->local_cnt) == 0); - + /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will + * inc/dec it frequently. Once we are D_DISKLESS, no one will touch + * the protected members anymore, though, so in the after_state_ch work + * it will be safe to free them. */ drbd_force_state(mdev, NS(disk, D_DISKLESS)); clear_bit(GO_DISKLESS, &mdev->flags); @@ -3735,7 +3742,10 @@ void drbd_go_diskless(struct drbd_conf *mdev) { D_ASSERT(mdev->state.disk == D_FAILED); if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) - drbd_queue_work_front(&mdev->data.work, &mdev->go_diskless); + drbd_queue_work(&mdev->data.work, &mdev->go_diskless); + /* don't drbd_queue_work_front, + * we need to serialize with the after_state_ch work + * of the -> D_FAILED transition. */ } /** -- cgit v0.10.2 From 856c50c7b616d50e1a3ccd4ce35f7814650fa594 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 14 Oct 2010 13:37:40 +0200 Subject: drbd: add some more explicit drbd_md_sync It sometimes may take a while for the after state change work to be scheduled, which does drbd_md_sync. At convenient places, we should do explicit drbd_md_sync to have the new state information on disk as soon as possible. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 5a4b6dc..6ec922c 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3804,6 +3804,9 @@ static void drbdd(struct drbd_conf *mdev) err_out: drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); } + /* If we leave here, we probably want to update at least the + * "Connected" indicator on stable storage. Do so explicitly here. */ + drbd_md_sync(mdev); } void drbd_flush_workqueue(struct drbd_conf *mdev) @@ -4685,10 +4688,12 @@ int drbd_asender(struct drbd_thread *thi) if (0) { reconnect: drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + drbd_md_sync(mdev); } if (0) { disconnect: drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + drbd_md_sync(mdev); } clear_bit(SIGNAL_ASENDER, &mdev->flags); -- cgit v0.10.2 From ca0e6098aad127a555ba29d12e0503dbb1577aac Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 14 Oct 2010 15:01:21 +0200 Subject: drbd: relax the grace period of the md_sync timer again Consolidate the ifdef's for the debug level, accidentally the used both DEBUG and DRBD_DEBUG_MD_SYNC. Default to off. For production, we can safely reduce the grace period for this timer again the the value we used to have. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 342574f..2e6a07e 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3565,7 +3565,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) * the meta-data super block. 
This function sets MD_DIRTY, and starts a * timer that ensures that within five seconds you have to call drbd_md_sync(). */ -#ifdef DRBD_DEBUG_MD_SYNC +#ifdef DEBUG void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func) { if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) { @@ -3578,7 +3578,7 @@ void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char * void drbd_md_mark_dirty(struct drbd_conf *mdev) { if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) - mod_timer(&mdev->md_sync_timer, jiffies + HZ); + mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); } #endif -- cgit v0.10.2 From 2265769531afe267f864111c103b04b4427720b6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 12 Aug 2010 00:38:45 +0200 Subject: drbd: cleanup: change "<= 0" to "== 0" dt is unsigned so it's never less than zero. We are calculating the elapsed time, and that's never less than zero (unless there is a bug or we invent time travel). The comparison here is just to guard against divide by zero bugs. Signed-off-by: Dan Carpenter Signed-off-by: Philipp Reisner diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index aec8426..ad325c5 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -118,7 +118,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) /* mean speed since syncer started * we do account for PausedSync periods */ dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; - if (dt <= 0) + if (dt == 0) dt = 1; db = mdev->rs_total - rs_left; dbdt = Bit2KB(db/dt); -- cgit v0.10.2 From be70e2671b95a8982ff133ebaafff6399ad393d4 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 14 Oct 2010 11:58:20 +0200 Subject: dynamic_debug.h: Fix dynamic_dev_dbg() macro if CONFIG_DYNAMIC_DEBUG not set Signed-off-by: Philipp Reisner diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 52c0da4..81bc20e 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -80,7 +80,7 @@ static inline int ddebug_remove_module(const char *mod) #define dynamic_pr_debug(fmt, ...) \ do { if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); } while (0) -#define dynamic_dev_dbg(dev, format, ...) \ +#define dynamic_dev_dbg(dev, fmt, ...) 
\ do { if (0) dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); } while (0) #endif -- cgit v0.10.2 From ac7241211ded714873e8dc6d2f7c98ae7ea2cc30 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 7 Oct 2010 15:18:08 +0200 Subject: drbd: use dynamic_dev_dbg to optionally log uuid changes Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 2e6a07e..bbe3bff 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3557,6 +3557,28 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) return rv; } +static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index) +{ + static char *uuid_str[UI_EXTENDED_SIZE] = { + [UI_CURRENT] = "CURRENT", + [UI_BITMAP] = "BITMAP", + [UI_HISTORY_START] = "HISTORY_START", + [UI_HISTORY_END] = "HISTORY_END", + [UI_SIZE] = "SIZE", + [UI_FLAGS] = "FLAGS", + }; + + if (index >= UI_EXTENDED_SIZE) { + dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n"); + return; + } + + dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n", + uuid_str[index], + (unsigned long long)mdev->ldev->md.uuid[index]); +} + + /** * drbd_md_mark_dirty() - Mark meta data super block as dirty * @mdev: DRBD device. @@ -3586,8 +3608,10 @@ static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) { int i; - for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) + for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) { mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; + debug_drbd_uuid(mdev, i+1); + } } void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) @@ -3602,6 +3626,7 @@ void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) } mdev->ldev->md.uuid[idx] = val; + debug_drbd_uuid(mdev, idx); drbd_md_mark_dirty(mdev); } @@ -3611,6 +3636,7 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) if (mdev->ldev->md.uuid[idx]) { drbd_uuid_move_history(mdev); mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; + debug_drbd_uuid(mdev, UI_HISTORY_START); } _drbd_uuid_set(mdev, idx, val); } @@ -3629,6 +3655,7 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) dev_info(DEV, "Creating new current UUID\n"); D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0); mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; + debug_drbd_uuid(mdev, UI_BITMAP); get_random_bytes(&val, sizeof(u64)); _drbd_uuid_set(mdev, UI_CURRENT, val); @@ -3643,6 +3670,8 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) drbd_uuid_move_history(mdev); mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; mdev->ldev->md.uuid[UI_BITMAP] = 0; + debug_drbd_uuid(mdev, UI_HISTORY_START); + debug_drbd_uuid(mdev, UI_BITMAP); } else { if (mdev->ldev->md.uuid[UI_BITMAP]) dev_warn(DEV, "bm UUID already set"); @@ -3650,6 +3679,7 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) mdev->ldev->md.uuid[UI_BITMAP] = val; mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1); + debug_drbd_uuid(mdev, UI_BITMAP); } drbd_md_mark_dirty(mdev); } -- cgit v0.10.2 From 5dbfe7aedf54aa7f62fd659e34371d4ea0e7bffe Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 15 Oct 2010 09:52:46 +0200 Subject: drbd: add race-breaker to drbd_go_diskless This adds a necessary race breaker to these commits: drbd: fix for possible deadlock on IO error during resync drbd: drop wrong debug asserts, fix recently introduced race What we do is 
get a refcount, check the state, then depending on the state and the requested minimum disk state, either hold it (success), or give it back immediately (failed "try lock"). Some code paths (flushing of drbd metadata) may still grab and hold a refcount even if we are D_FAILED (application IO won't). So even if we hit local_cnt == 0 once after being D_FAILED, we still need to wait for that again after we changed to D_DISKLESS. Once local_cnt reaches 0 while we are D_DISKLESS, we can be sure that no one will look at the protected members anymore, so only then is it safe to free them. We cannot easily convert to standard locking primitives here, as we want to be able to use it in atomic context (we always do a "try lock"), as well as hold references for a "long time" (from IO submission to completion callback). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index bbe3bff..8bfedc7 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3763,6 +3763,9 @@ static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused * the protected members anymore, though, so in the after_state_ch work * it will be safe to free them. */ drbd_force_state(mdev, NS(disk, D_DISKLESS)); + /* We need to wait for return of references checked out while we still + * have been D_FAILED, though (drbd_md_sync, bitmap io). */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); clear_bit(GO_DISKLESS, &mdev->flags); return 1; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index da7d9bd..9b2a015 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.9rc1" +#define REL_VERSION "8.3.9rc2" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 95 -- cgit v0.10.2 From 6362beea8914cbd4630ccde3617d944aeca2d48f Mon Sep 17 00:00:00 2001 From: Mike Miller Date: Tue, 19 Oct 2010 09:40:34 +0200 Subject: cciss: fix PCI IDs for new Smart Array controllers cciss: fix PCI IDs for new controllers This patch fixes the botched up PCI IDs of new controllers. Please consider this patch for inclusion. 
Signed-off-by: Mike Miller Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index df2ed4d..41f9acb 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -105,11 +105,12 @@ static const struct pci_device_id cciss_pci_device_id[] = { {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3249}, {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324A}, {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324B}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3250}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3251}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3252}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3253}, - {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3254}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3350}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3351}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3352}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3353}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3354}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3355}, {0,} }; @@ -149,11 +150,12 @@ static struct board_type products[] = { {0x3249103C, "Smart Array P812", &SA5_access}, {0x324A103C, "Smart Array P712m", &SA5_access}, {0x324B103C, "Smart Array P711m", &SA5_access}, - {0x3250103C, "Smart Array", &SA5_access}, - {0x3251103C, "Smart Array", &SA5_access}, - {0x3252103C, "Smart Array", &SA5_access}, - {0x3253103C, "Smart Array", &SA5_access}, - {0x3254103C, "Smart Array", &SA5_access}, + {0x3350103C, "Smart Array", &SA5_access}, + {0x3351103C, "Smart Array", &SA5_access}, + {0x3352103C, "Smart Array", &SA5_access}, + {0x3353103C, "Smart Array", &SA5_access}, + {0x3354103C, "Smart Array", &SA5_access}, + {0x3355103C, "Smart Array", &SA5_access}, }; /* How long to wait (in milliseconds) for board to go into simple mode */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index f6a3b2d..fe7d72c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -739,6 +739,7 @@ #define PCI_DEVICE_ID_HP_CISSC 0x3230 #define PCI_DEVICE_ID_HP_CISSD 0x3238 #define PCI_DEVICE_ID_HP_CISSE 0x323a +#define PCI_DEVICE_ID_HP_CISSF 0x323b #define PCI_DEVICE_ID_HP_ZX2_IOC 0x4031 #define PCI_VENDOR_ID_PCTECH 0x1042 -- cgit v0.10.2
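The two cciss tables patched above have to stay in sync: every new {vendor, device, subvendor, subdevice} row in cciss_pci_device_id needs a products[] row whose board_id packs the same subsystem IDs. A small sketch of that packing, inferred from the table values themselves (0x103C / 0x3350 pairs with 0x3350103C); the helper name is made up for illustration:

    #include <assert.h>
    #include <stdint.h>

    /* Pack subsystem IDs the way the products[].board_id column expects:
     * subsystem device in the high 16 bits, subsystem vendor in the low 16. */
    static uint32_t cciss_board_id(uint16_t subsys_vendor, uint16_t subsys_device)
    {
        return ((uint32_t)subsys_device << 16) | subsys_vendor;
    }

    int main(void)
    {
        /* Matches the new CISSF entries added above. */
        assert(cciss_board_id(0x103C, 0x3350) == 0x3350103C);
        assert(cciss_board_id(0x103C, 0x3355) == 0x3355103C);
        return 0;
    }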