diff options
Diffstat (limited to 'drivers/block')
27 files changed, 5203 insertions, 2152 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 9a13e88..5b5ee79 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -6547,7 +6547,7 @@ static ssize_t dac960_user_command_proc_write(struct file *file, const char __user *Buffer, size_t Count, loff_t *pos) { - DAC960_Controller_T *Controller = (DAC960_Controller_T *) PDE(file->f_path.dentry->d_inode)->data; + DAC960_Controller_T *Controller = (DAC960_Controller_T *) PDE(file_inode(file))->data; unsigned char CommandBuffer[80]; int Length; if (Count > sizeof(CommandBuffer)-1) return -EINVAL; @@ -7054,6 +7054,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, else ErrorCode = 0; } + break; default: ErrorCode = -ENOTTY; } diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 824e09c..5dc0dae 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -63,19 +63,6 @@ config AMIGA_Z2RAM To compile this driver as a module, choose M here: the module will be called z2ram. -config BLK_DEV_XD - tristate "XT hard disk support" - depends on ISA && ISA_DMA_API - select CHECK_SIGNATURE - help - Very old 8 bit hard disk controllers used in the IBM XT computer - will be supported if you say Y here. - - To compile this driver as a module, choose M here: the - module will be called xd. - - It's pretty unlikely that you have one of these: say N. - config GDROM tristate "SEGA Dreamcast GD-ROM drive" depends on SH_DREAMCAST @@ -544,4 +531,14 @@ config BLK_DEV_RBD If unsure, say N. +config BLK_DEV_RSXX + tristate "RamSam PCIe Flash SSD Device Driver" + depends on PCI + help + Device driver for IBM's high speed PCIe SSD + storage devices: RamSan-70 and RamSan-80. + + To compile this driver as a module, choose M here: the + module will be called rsxx. + endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 17e82df..a3b4023 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -15,7 +15,6 @@ obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o obj-$(CONFIG_BLK_DEV_RAM) += brd.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o -obj-$(CONFIG_BLK_DEV_XD) += xd.o obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o @@ -41,4 +40,6 @@ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ +obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ + swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8c13eeb..e98da67 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2660,25 +2660,24 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, mdev->read_requests = RB_ROOT; mdev->write_requests = RB_ROOT; - if (!idr_pre_get(&minors, GFP_KERNEL)) - goto out_no_minor_idr; - if (idr_get_new_above(&minors, mdev, minor, &minor_got)) + minor_got = idr_alloc(&minors, mdev, minor, minor + 1, GFP_KERNEL); + if (minor_got < 0) { + if (minor_got == -ENOSPC) { + err = ERR_MINOR_EXISTS; + drbd_msg_put_info("requested minor exists already"); + } goto out_no_minor_idr; - if (minor_got != minor) { - err = ERR_MINOR_EXISTS; - drbd_msg_put_info("requested minor exists already"); - goto out_idr_remove_minor; } - if (!idr_pre_get(&tconn->volumes, GFP_KERNEL)) - goto out_idr_remove_minor; - if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got)) + vnr_got = idr_alloc(&tconn->volumes, mdev, vnr, vnr + 1, GFP_KERNEL); + if (vnr_got < 0) { + if (vnr_got == -ENOSPC) { + err = ERR_INVALID_REQUEST; + drbd_msg_put_info("requested volume exists already"); + } goto out_idr_remove_minor; - if (vnr_got != vnr) { - err = ERR_INVALID_REQUEST; - drbd_msg_put_info("requested volume exists already"); - goto out_idr_remove_vol; } + add_disk(disk); kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ @@ -2689,8 +2688,6 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, return NO_ERROR; -out_idr_remove_vol: - idr_remove(&tconn->volumes, vnr_got); out_idr_remove_minor: idr_remove(&minors, minor_got); synchronize_rcu(); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ae12512..747bb2a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -162,12 +162,13 @@ static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) { - loff_t size, loopsize; + loff_t loopsize; /* Compute loopsize in bytes */ - size = i_size_read(file->f_mapping->host); - loopsize = size - offset; - /* offset is beyond i_size, wierd but possible */ + loopsize = i_size_read(file->f_mapping->host); + if (offset > 0) + loopsize -= offset; + /* offset is beyond i_size, weird but possible */ if (loopsize < 0) return 0; @@ -190,6 +191,7 @@ figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) { loff_t size = get_size(offset, sizelimit, lo->lo_backing_file); sector_t x = (sector_t)size; + struct block_device *bdev = lo->lo_device; if (unlikely((loff_t)x != size)) return -EFBIG; @@ -198,6 +200,9 @@ figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) if (lo->lo_sizelimit != sizelimit) lo->lo_sizelimit = sizelimit; set_capacity(lo->lo_disk, x); + bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9); + /* let user-space know about the new size */ + kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); return 0; } @@ -1091,10 +1096,10 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) return err; if (lo->lo_offset != info->lo_offset || - lo->lo_sizelimit != info->lo_sizelimit) { + lo->lo_sizelimit != info->lo_sizelimit) if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) return -EFBIG; - } + loop_config_discard(lo); memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); @@ -1139,7 +1144,7 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) if (lo->lo_state != Lo_bound) return -ENXIO; - error = vfs_getattr(file->f_path.mnt, file->f_path.dentry, &stat); + error = vfs_getattr(&file->f_path, &stat); if (error) return error; memset(info, 0, sizeof(*info)); @@ -1271,28 +1276,10 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev) { - int err; - sector_t sec; - loff_t sz; - - err = -ENXIO; if (unlikely(lo->lo_state != Lo_bound)) - goto out; - err = figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); - if (unlikely(err)) - goto out; - sec = get_capacity(lo->lo_disk); - /* the width of sector_t may be narrow for bit-shift */ - sz = sec; - sz <<= 9; - mutex_lock(&bdev->bd_mutex); - bd_set_size(bdev, sz); - /* let user-space know about the new size */ - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - mutex_unlock(&bdev->bd_mutex); + return -ENXIO; - out: - return err; + return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); } static int lo_ioctl(struct block_device *bdev, fmode_t mode, @@ -1624,30 +1611,17 @@ static int loop_add(struct loop_device **l, int i) if (!lo) goto out; - if (!idr_pre_get(&loop_index_idr, GFP_KERNEL)) - goto out_free_dev; - + /* allocate id, if @id >= 0, we're requesting that specific id */ if (i >= 0) { - int m; - - /* create specific i in the index */ - err = idr_get_new_above(&loop_index_idr, lo, i, &m); - if (err >= 0 && i != m) { - idr_remove(&loop_index_idr, m); + err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL); + if (err == -ENOSPC) err = -EEXIST; - } - } else if (i == -1) { - int m; - - /* get next free nr */ - err = idr_get_new(&loop_index_idr, lo, &m); - if (err >= 0) - i = m; } else { - err = -EINVAL; + err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL); } if (err < 0) goto out_free_dev; + i = err; lo->lo_queue = blk_alloc_queue(GFP_KERNEL); if (!lo->lo_queue) @@ -1858,11 +1832,15 @@ static int __init loop_init(void) max_part = (1UL << part_shift) - 1; } - if ((1UL << part_shift) > DISK_MAX_PARTS) - return -EINVAL; + if ((1UL << part_shift) > DISK_MAX_PARTS) { + err = -EINVAL; + goto misc_out; + } - if (max_loop > 1UL << (MINORBITS - part_shift)) - return -EINVAL; + if (max_loop > 1UL << (MINORBITS - part_shift)) { + err = -EINVAL; + goto misc_out; + } /* * If max_loop is specified, create that many devices upfront. @@ -1880,8 +1858,10 @@ static int __init loop_init(void) range = 1UL << MINORBITS; } - if (register_blkdev(LOOP_MAJOR, "loop")) - return -EIO; + if (register_blkdev(LOOP_MAJOR, "loop")) { + err = -EIO; + goto misc_out; + } blk_register_region(MKDEV(LOOP_MAJOR, 0), range, THIS_MODULE, loop_probe, NULL, NULL); @@ -1894,6 +1874,10 @@ static int __init loop_init(void) printk(KERN_INFO "loop: module loaded\n"); return 0; + +misc_out: + misc_deregister(&loop_misc); + return err; } static int loop_exit_cb(int id, void *ptr, void *data) @@ -1911,7 +1895,6 @@ static void __exit loop_exit(void) range = max_loop ? max_loop << part_shift : 1UL << MINORBITS; idr_for_each(&loop_index_idr, &loop_exit_cb, NULL); - idr_remove_all(&loop_index_idr); idr_destroy(&loop_index_idr); blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig index 0ba837f..1fca1f99 100644 --- a/drivers/block/mtip32xx/Kconfig +++ b/drivers/block/mtip32xx/Kconfig @@ -4,6 +4,6 @@ config BLK_DEV_PCIESSD_MTIP32XX tristate "Block Device Driver for Micron PCIe SSDs" - depends on PCI + depends on PCI && GENERIC_HARDIRQS help This enables the block driver for Micron PCIe SSDs. diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 3fd1009..11cc952 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -88,6 +88,8 @@ static int instance; static int mtip_major; static struct dentry *dfs_parent; +static u32 cpu_use[NR_CPUS]; + static DEFINE_SPINLOCK(rssd_index_lock); static DEFINE_IDA(rssd_index_ida); @@ -296,16 +298,17 @@ static int hba_reset_nosleep(struct driver_data *dd) */ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) { - atomic_set(&port->commands[tag].active, 1); + int group = tag >> 5; - spin_lock(&port->cmd_issue_lock); + atomic_set(&port->commands[tag].active, 1); + /* guard SACT and CI registers */ + spin_lock(&port->cmd_issue_lock[group]); writel((1 << MTIP_TAG_BIT(tag)), port->s_active[MTIP_TAG_INDEX(tag)]); writel((1 << MTIP_TAG_BIT(tag)), port->cmd_issue[MTIP_TAG_INDEX(tag)]); - - spin_unlock(&port->cmd_issue_lock); + spin_unlock(&port->cmd_issue_lock[group]); /* Set the command's timeout value.*/ port->commands[tag].comp_time = jiffies + msecs_to_jiffies( @@ -964,56 +967,56 @@ handle_tfe_exit: /* * Handle a set device bits interrupt */ -static inline void mtip_process_sdbf(struct driver_data *dd) +static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, + u32 completed) { - struct mtip_port *port = dd->port; - int group, tag, bit; - u32 completed; + struct driver_data *dd = port->dd; + int tag, bit; struct mtip_cmd *command; - /* walk all bits in all slot groups */ - for (group = 0; group < dd->slot_groups; group++) { - completed = readl(port->completed[group]); - if (!completed) - continue; + if (!completed) { + WARN_ON_ONCE(!completed); + return; + } + /* clear completed status register in the hardware.*/ + writel(completed, port->completed[group]); - /* clear completed status register in the hardware.*/ - writel(completed, port->completed[group]); + /* Process completed commands. */ + for (bit = 0; (bit < 32) && completed; bit++) { + if (completed & 0x01) { + tag = (group << 5) | bit; - /* Process completed commands. */ - for (bit = 0; - (bit < 32) && completed; - bit++, completed >>= 1) { - if (completed & 0x01) { - tag = (group << 5) | bit; + /* skip internal command slot. */ + if (unlikely(tag == MTIP_TAG_INTERNAL)) + continue; - /* skip internal command slot. */ - if (unlikely(tag == MTIP_TAG_INTERNAL)) - continue; + command = &port->commands[tag]; + /* make internal callback */ + if (likely(command->comp_func)) { + command->comp_func( + port, + tag, + command->comp_data, + 0); + } else { + dev_warn(&dd->pdev->dev, + "Null completion " + "for tag %d", + tag); - command = &port->commands[tag]; - /* make internal callback */ - if (likely(command->comp_func)) { - command->comp_func( - port, - tag, - command->comp_data, - 0); - } else { - dev_warn(&dd->pdev->dev, - "Null completion " - "for tag %d", - tag); - - if (mtip_check_surprise_removal( - dd->pdev)) { - mtip_command_cleanup(dd); - return; - } + if (mtip_check_surprise_removal( + dd->pdev)) { + mtip_command_cleanup(dd); + return; } } } + completed >>= 1; } + + /* If last, re-enable interrupts */ + if (atomic_dec_return(&dd->irq_workers_active) == 0) + writel(0xffffffff, dd->mmio + HOST_IRQ_STAT); } /* @@ -1072,6 +1075,8 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) struct mtip_port *port = dd->port; u32 hba_stat, port_stat; int rv = IRQ_NONE; + int do_irq_enable = 1, i, workers; + struct mtip_work *twork; hba_stat = readl(dd->mmio + HOST_IRQ_STAT); if (hba_stat) { @@ -1082,8 +1087,42 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) writel(port_stat, port->mmio + PORT_IRQ_STAT); /* Demux port status */ - if (likely(port_stat & PORT_IRQ_SDB_FIS)) - mtip_process_sdbf(dd); + if (likely(port_stat & PORT_IRQ_SDB_FIS)) { + do_irq_enable = 0; + WARN_ON_ONCE(atomic_read(&dd->irq_workers_active) != 0); + + /* Start at 1: group zero is always local? */ + for (i = 0, workers = 0; i < MTIP_MAX_SLOT_GROUPS; + i++) { + twork = &dd->work[i]; + twork->completed = readl(port->completed[i]); + if (twork->completed) + workers++; + } + + atomic_set(&dd->irq_workers_active, workers); + if (workers) { + for (i = 1; i < MTIP_MAX_SLOT_GROUPS; i++) { + twork = &dd->work[i]; + if (twork->completed) + queue_work_on( + twork->cpu_binding, + dd->isr_workq, + &twork->work); + } + + if (likely(dd->work[0].completed)) + mtip_workq_sdbfx(port, 0, + dd->work[0].completed); + + } else { + /* + * Chip quirk: SDB interrupt but nothing + * to complete + */ + do_irq_enable = 1; + } + } if (unlikely(port_stat & PORT_IRQ_ERR)) { if (unlikely(mtip_check_surprise_removal(dd->pdev))) { @@ -1103,21 +1142,13 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) } /* acknowledge interrupt */ - writel(hba_stat, dd->mmio + HOST_IRQ_STAT); + if (unlikely(do_irq_enable)) + writel(hba_stat, dd->mmio + HOST_IRQ_STAT); return rv; } /* - * Wrapper for mtip_handle_irq - * (ignores return code) - */ -static void mtip_tasklet(unsigned long data) -{ - mtip_handle_irq((struct driver_data *) data); -} - -/* * HBA interrupt subroutine. * * @irq IRQ number. @@ -1130,8 +1161,8 @@ static void mtip_tasklet(unsigned long data) static irqreturn_t mtip_irq_handler(int irq, void *instance) { struct driver_data *dd = instance; - tasklet_schedule(&dd->tasklet); - return IRQ_HANDLED; + + return mtip_handle_irq(dd); } static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) @@ -1489,6 +1520,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) } #endif + /* Demux ID.DRAT & ID.RZAT to determine trim support */ + if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5)) + port->dd->trim_supp = true; + else + port->dd->trim_supp = false; + /* Set the identify buffer as valid. */ port->identify_valid = 1; @@ -1676,6 +1713,81 @@ static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, } /* + * Trim unused sectors + * + * @dd pointer to driver_data structure + * @lba starting lba + * @len # of 512b sectors to trim + * + * return value + * -ENOMEM Out of dma memory + * -EINVAL Invalid parameters passed in, trim not supported + * -EIO Error submitting trim request to hw + */ +static int mtip_send_trim(struct driver_data *dd, unsigned int lba, unsigned int len) +{ + int i, rv = 0; + u64 tlba, tlen, sect_left; + struct mtip_trim_entry *buf; + dma_addr_t dma_addr; + struct host_to_dev_fis fis; + + if (!len || dd->trim_supp == false) + return -EINVAL; + + /* Trim request too big */ + WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES)); + + /* Trim request not aligned on 4k boundary */ + WARN_ON(len % 8 != 0); + + /* Warn if vu_trim structure is too big */ + WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE); + + /* Allocate a DMA buffer for the trim structure */ + buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr, + GFP_KERNEL); + if (!buf) + return -ENOMEM; + memset(buf, 0, ATA_SECT_SIZE); + + for (i = 0, sect_left = len, tlba = lba; + i < MTIP_MAX_TRIM_ENTRIES && sect_left; + i++) { + tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ? + MTIP_MAX_TRIM_ENTRY_LEN : + sect_left); + buf[i].lba = __force_bit2int cpu_to_le32(tlba); + buf[i].range = __force_bit2int cpu_to_le16(tlen); + tlba += tlen; + sect_left -= tlen; + } + WARN_ON(sect_left != 0); + + /* Build the fis */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = 0xfb; + fis.features = 0x60; + fis.sect_count = 1; + fis.device = ATA_DEVICE_OBS; + + if (mtip_exec_internal_command(dd->port, + &fis, + 5, + dma_addr, + ATA_SECT_SIZE, + 0, + GFP_KERNEL, + MTIP_TRIM_TIMEOUT_MS) < 0) + rv = -EIO; + + dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr); + return rv; +} + +/* * Get the drive capacity. * * @dd Pointer to the device data structure. @@ -3005,20 +3117,24 @@ static int mtip_hw_init(struct driver_data *dd) hba_setup(dd); - tasklet_init(&dd->tasklet, mtip_tasklet, (unsigned long)dd); - - dd->port = kzalloc(sizeof(struct mtip_port), GFP_KERNEL); + dd->port = kzalloc_node(sizeof(struct mtip_port), GFP_KERNEL, + dd->numa_node); if (!dd->port) { dev_err(&dd->pdev->dev, "Memory allocation: port structure\n"); return -ENOMEM; } + /* Continue workqueue setup */ + for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) + dd->work[i].port = dd->port; + /* Counting semaphore to track command slot usage */ sema_init(&dd->port->cmd_slot, num_command_slots - 1); /* Spinlock to prevent concurrent issue */ - spin_lock_init(&dd->port->cmd_issue_lock); + for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) + spin_lock_init(&dd->port->cmd_issue_lock[i]); /* Set the port mmio base address. */ dd->port->mmio = dd->mmio + PORT_OFFSET; @@ -3165,6 +3281,7 @@ static int mtip_hw_init(struct driver_data *dd) "Unable to allocate IRQ %d\n", dd->pdev->irq); goto out2; } + irq_set_affinity_hint(dd->pdev->irq, get_cpu_mask(dd->isr_binding)); /* Enable interrupts on the HBA. */ writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, @@ -3241,7 +3358,8 @@ out3: writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, dd->mmio + HOST_CTL); - /*Release the IRQ. */ + /* Release the IRQ. */ + irq_set_affinity_hint(dd->pdev->irq, NULL); devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); out2: @@ -3291,11 +3409,9 @@ static int mtip_hw_exit(struct driver_data *dd) del_timer_sync(&dd->port->cmd_timer); /* Release the IRQ. */ + irq_set_affinity_hint(dd->pdev->irq, NULL); devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); - /* Stop the bottom half tasklet. */ - tasklet_kill(&dd->tasklet); - /* Free the command/command header memory. */ dmam_free_coherent(&dd->pdev->dev, HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), @@ -3641,6 +3757,12 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) } } + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + bio_endio(bio, mtip_send_trim(dd, bio->bi_sector, + bio_sectors(bio))); + return; + } + if (unlikely(!bio_has_data(bio))) { blk_queue_flush(queue, 0); bio_endio(bio, 0); @@ -3711,7 +3833,7 @@ static int mtip_block_initialize(struct driver_data *dd) goto protocol_init_error; } - dd->disk = alloc_disk(MTIP_MAX_MINORS); + dd->disk = alloc_disk_node(MTIP_MAX_MINORS, dd->numa_node); if (dd->disk == NULL) { dev_err(&dd->pdev->dev, "Unable to allocate gendisk structure\n"); @@ -3755,7 +3877,7 @@ static int mtip_block_initialize(struct driver_data *dd) skip_create_disk: /* Allocate the request queue. */ - dd->queue = blk_alloc_queue(GFP_KERNEL); + dd->queue = blk_alloc_queue_node(GFP_KERNEL, dd->numa_node); if (dd->queue == NULL) { dev_err(&dd->pdev->dev, "Unable to allocate request queue\n"); @@ -3783,6 +3905,15 @@ skip_create_disk: */ blk_queue_flush(dd->queue, 0); + /* Signal trim support */ + if (dd->trim_supp == true) { + set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags); + dd->queue->limits.discard_granularity = 4096; + blk_queue_max_discard_sectors(dd->queue, + MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES); + dd->queue->limits.discard_zeroes_data = 0; + } + /* Set the capacity of the device in 512 byte sectors. */ if (!(mtip_hw_get_capacity(dd, &capacity))) { dev_warn(&dd->pdev->dev, @@ -3813,9 +3944,8 @@ skip_create_disk: start_service_thread: sprintf(thd_name, "mtip_svc_thd_%02d", index); - - dd->mtip_svc_handler = kthread_run(mtip_service_thread, - dd, thd_name); + dd->mtip_svc_handler = kthread_create_on_node(mtip_service_thread, + dd, dd->numa_node, thd_name); if (IS_ERR(dd->mtip_svc_handler)) { dev_err(&dd->pdev->dev, "service thread failed to start\n"); @@ -3823,7 +3953,7 @@ start_service_thread: rv = -EFAULT; goto kthread_run_error; } - + wake_up_process(dd->mtip_svc_handler); if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) rv = wait_for_rebuild; @@ -3963,6 +4093,56 @@ static int mtip_block_resume(struct driver_data *dd) return 0; } +static void drop_cpu(int cpu) +{ + cpu_use[cpu]--; +} + +static int get_least_used_cpu_on_node(int node) +{ + int cpu, least_used_cpu, least_cnt; + const struct cpumask *node_mask; + + node_mask = cpumask_of_node(node); + least_used_cpu = cpumask_first(node_mask); + least_cnt = cpu_use[least_used_cpu]; + cpu = least_used_cpu; + + for_each_cpu(cpu, node_mask) { + if (cpu_use[cpu] < least_cnt) { + least_used_cpu = cpu; + least_cnt = cpu_use[cpu]; + } + } + cpu_use[least_used_cpu]++; + return least_used_cpu; +} + +/* Helper for selecting a node in round robin mode */ +static inline int mtip_get_next_rr_node(void) +{ + static int next_node = -1; + + if (next_node == -1) { + next_node = first_online_node; + return next_node; + } + + next_node = next_online_node(next_node); + if (next_node == MAX_NUMNODES) + next_node = first_online_node; + return next_node; +} + +static DEFINE_HANDLER(0); +static DEFINE_HANDLER(1); +static DEFINE_HANDLER(2); +static DEFINE_HANDLER(3); +static DEFINE_HANDLER(4); +static DEFINE_HANDLER(5); +static DEFINE_HANDLER(6); +static DEFINE_HANDLER(7); + /* * Called for each supported PCI device detected. * @@ -3977,9 +4157,25 @@ static int mtip_pci_probe(struct pci_dev *pdev, { int rv = 0; struct driver_data *dd = NULL; + char cpu_list[256]; + const struct cpumask *node_mask; + int cpu, i = 0, j = 0; + int my_node = NUMA_NO_NODE; /* Allocate memory for this devices private data. */ - dd = kzalloc(sizeof(struct driver_data), GFP_KERNEL); + my_node = pcibus_to_node(pdev->bus); + if (my_node != NUMA_NO_NODE) { + if (!node_online(my_node)) + my_node = mtip_get_next_rr_node(); + } else { + dev_info(&pdev->dev, "Kernel not reporting proximity, choosing a node\n"); + my_node = mtip_get_next_rr_node(); + } + dev_info(&pdev->dev, "NUMA node %d (closest: %d,%d, probe on %d:%d)\n", + my_node, pcibus_to_node(pdev->bus), dev_to_node(&pdev->dev), + cpu_to_node(smp_processor_id()), smp_processor_id()); + + dd = kzalloc_node(sizeof(struct driver_data), GFP_KERNEL, my_node); if (dd == NULL) { dev_err(&pdev->dev, "Unable to allocate memory for driver data\n"); @@ -4016,19 +4212,82 @@ static int mtip_pci_probe(struct pci_dev *pdev, } } - pci_set_master(pdev); + /* Copy the info we may need later into the private data structure. */ + dd->major = mtip_major; + dd->instance = instance; + dd->pdev = pdev; + dd->numa_node = my_node; + memset(dd->workq_name, 0, 32); + snprintf(dd->workq_name, 31, "mtipq%d", dd->instance); + + dd->isr_workq = create_workqueue(dd->workq_name); + if (!dd->isr_workq) { + dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance); + goto block_initialize_err; + } + + memset(cpu_list, 0, sizeof(cpu_list)); + + node_mask = cpumask_of_node(dd->numa_node); + if (!cpumask_empty(node_mask)) { + for_each_cpu(cpu, node_mask) + { + snprintf(&cpu_list[j], 256 - j, "%d ", cpu); + j = strlen(cpu_list); + } + + dev_info(&pdev->dev, "Node %d on package %d has %d cpu(s): %s\n", + dd->numa_node, + topology_physical_package_id(cpumask_first(node_mask)), + nr_cpus_node(dd->numa_node), + cpu_list); + } else + dev_dbg(&pdev->dev, "mtip32xx: node_mask empty\n"); + + dd->isr_binding = get_least_used_cpu_on_node(dd->numa_node); + dev_info(&pdev->dev, "Initial IRQ binding node:cpu %d:%d\n", + cpu_to_node(dd->isr_binding), dd->isr_binding); + + /* first worker context always runs in ISR */ + dd->work[0].cpu_binding = dd->isr_binding; + dd->work[1].cpu_binding = get_least_used_cpu_on_node(dd->numa_node); + dd->work[2].cpu_binding = get_least_used_cpu_on_node(dd->numa_node); + dd->work[3].cpu_binding = dd->work[0].cpu_binding; + dd->work[4].cpu_binding = dd->work[1].cpu_binding; + dd->work[5].cpu_binding = dd->work[2].cpu_binding; + dd->work[6].cpu_binding = dd->work[2].cpu_binding; + dd->work[7].cpu_binding = dd->work[1].cpu_binding; + + /* Log the bindings */ + for_each_present_cpu(cpu) { + memset(cpu_list, 0, sizeof(cpu_list)); + for (i = 0, j = 0; i < MTIP_MAX_SLOT_GROUPS; i++) { + if (dd->work[i].cpu_binding == cpu) { + snprintf(&cpu_list[j], 256 - j, "%d ", i); + j = strlen(cpu_list); + } + } + if (j) + dev_info(&pdev->dev, "CPU %d: WQs %s\n", cpu, cpu_list); + } + + INIT_WORK(&dd->work[0].work, mtip_workq_sdbf0); + INIT_WORK(&dd->work[1].work, mtip_workq_sdbf1); + INIT_WORK(&dd->work[2].work, mtip_workq_sdbf2); + INIT_WORK(&dd->work[3].work, mtip_workq_sdbf3); + INIT_WORK(&dd->work[4].work, mtip_workq_sdbf4); + INIT_WORK(&dd->work[5].work, mtip_workq_sdbf5); + INIT_WORK(&dd->work[6].work, mtip_workq_sdbf6); + INIT_WORK(&dd->work[7].work, mtip_workq_sdbf7); + + pci_set_master(pdev); if (pci_enable_msi(pdev)) { dev_warn(&pdev->dev, "Unable to enable MSI interrupt.\n"); goto block_initialize_err; } - /* Copy the info we may need later into the private data structure. */ - dd->major = mtip_major; - dd->instance = instance; - dd->pdev = pdev; - /* Initialize the block layer. */ rv = mtip_block_initialize(dd); if (rv < 0) { @@ -4048,7 +4307,13 @@ static int mtip_pci_probe(struct pci_dev *pdev, block_initialize_err: pci_disable_msi(pdev); - + if (dd->isr_workq) { + flush_workqueue(dd->isr_workq); + destroy_workqueue(dd->isr_workq); + drop_cpu(dd->work[0].cpu_binding); + drop_cpu(dd->work[1].cpu_binding); + drop_cpu(dd->work[2].cpu_binding); + } setmask_err: pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); @@ -4089,6 +4354,14 @@ static void mtip_pci_remove(struct pci_dev *pdev) /* Clean up the block layer. */ mtip_block_remove(dd); + if (dd->isr_workq) { + flush_workqueue(dd->isr_workq); + destroy_workqueue(dd->isr_workq); + drop_cpu(dd->work[0].cpu_binding); + drop_cpu(dd->work[1].cpu_binding); + drop_cpu(dd->work[2].cpu_binding); + } + pci_disable_msi(pdev); kfree(dd); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index b174264..3bffff5 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -164,6 +164,35 @@ struct smart_attr { u8 res[3]; } __packed; +struct mtip_work { + struct work_struct work; + void *port; + int cpu_binding; + u32 completed; +} ____cacheline_aligned_in_smp; + +#define DEFINE_HANDLER(group) \ + void mtip_workq_sdbf##group(struct work_struct *work) \ + { \ + struct mtip_work *w = (struct mtip_work *) work; \ + mtip_workq_sdbfx(w->port, group, w->completed); \ + } + +#define MTIP_TRIM_TIMEOUT_MS 240000 +#define MTIP_MAX_TRIM_ENTRIES 8 +#define MTIP_MAX_TRIM_ENTRY_LEN 0xfff8 + +struct mtip_trim_entry { + u32 lba; /* starting lba of region */ + u16 rsvd; /* unused */ + u16 range; /* # of 512b blocks to trim */ +} __packed; + +struct mtip_trim { + /* Array of regions to trim */ + struct mtip_trim_entry entry[MTIP_MAX_TRIM_ENTRIES]; +} __packed; + /* Register Frame Information Structure (FIS), host to device. */ struct host_to_dev_fis { /* @@ -424,7 +453,7 @@ struct mtip_port { */ struct semaphore cmd_slot; /* Spinlock for working around command-issue bug. */ - spinlock_t cmd_issue_lock; + spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS]; }; /* @@ -447,9 +476,6 @@ struct driver_data { struct mtip_port *port; /* Pointer to the port data structure. */ - /* Tasklet used to process the bottom half of the ISR. */ - struct tasklet_struct tasklet; - unsigned product_type; /* magic value declaring the product type */ unsigned slot_groups; /* number of slot groups the product supports */ @@ -461,6 +487,20 @@ struct driver_data { struct task_struct *mtip_svc_handler; /* task_struct of svc thd */ struct dentry *dfs_node; + + bool trim_supp; /* flag indicating trim support */ + + int numa_node; /* NUMA support */ + + char workq_name[32]; + + struct workqueue_struct *isr_workq; + + struct mtip_work work[MTIP_MAX_SLOT_GROUPS]; + + atomic_t irq_workers_active; + + int isr_binding; }; #endif diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 043ddcc..7fecc78 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -98,6 +98,7 @@ static const char *nbdcmd_to_ascii(int cmd) case NBD_CMD_READ: return "read"; case NBD_CMD_WRITE: return "write"; case NBD_CMD_DISC: return "disconnect"; + case NBD_CMD_FLUSH: return "flush"; case NBD_CMD_TRIM: return "trim/discard"; } return "invalid"; @@ -244,8 +245,15 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) request.magic = htonl(NBD_REQUEST_MAGIC); request.type = htonl(nbd_cmd(req)); - request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); - request.len = htonl(size); + + if (nbd_cmd(req) == NBD_CMD_FLUSH) { + /* Other values are reserved for FLUSH requests. */ + request.from = 0; + request.len = 0; + } else { + request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); + request.len = htonl(size); + } memcpy(request.handle, &req, sizeof(req)); dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n", @@ -482,6 +490,11 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) } } + if (req->cmd_flags & REQ_FLUSH) { + BUG_ON(unlikely(blk_rq_sectors(req))); + nbd_cmd(req) = NBD_CMD_FLUSH; + } + req->errors = 0; mutex_lock(&nbd->tx_lock); @@ -551,6 +564,7 @@ static int nbd_thread(void *data) */ static void do_nbd_request(struct request_queue *q) + __releases(q->queue_lock) __acquires(q->queue_lock) { struct request *req; @@ -595,12 +609,20 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, struct request sreq; dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); + if (!nbd->sock) + return -EINVAL; + mutex_unlock(&nbd->tx_lock); + fsync_bdev(bdev); + mutex_lock(&nbd->tx_lock); blk_rq_init(NULL, &sreq); sreq.cmd_type = REQ_TYPE_SPECIAL; nbd_cmd(&sreq) = NBD_CMD_DISC; + + /* Check again after getting mutex back. */ if (!nbd->sock) return -EINVAL; + nbd_send_req(nbd, &sreq); return 0; } @@ -614,6 +636,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd_clear_que(nbd); BUG_ON(!list_empty(&nbd->queue_head)); BUG_ON(!list_empty(&nbd->waiting_queue)); + kill_bdev(bdev); if (file) fput(file); return 0; @@ -625,7 +648,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return -EBUSY; file = fget(arg); if (file) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); if (S_ISSOCK(inode->i_mode)) { nbd->file = file; nbd->sock = SOCKET_I(inode); @@ -681,9 +704,15 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, mutex_unlock(&nbd->tx_lock); + if (nbd->flags & NBD_FLAG_READ_ONLY) + set_device_ro(bdev, true); if (nbd->flags & NBD_FLAG_SEND_TRIM) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); + if (nbd->flags & NBD_FLAG_SEND_FLUSH) + blk_queue_flush(nbd->disk->queue, REQ_FLUSH); + else + blk_queue_flush(nbd->disk->queue, 0); thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name); if (IS_ERR(thread)) { @@ -702,9 +731,12 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd->file = NULL; nbd_clear_que(nbd); dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); + kill_bdev(bdev); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); + set_device_ro(bdev, false); if (file) fput(file); + nbd->flags = 0; nbd->bytesize = 0; bdev->bd_inode->i_size = 0; set_capacity(nbd->disk, 0); diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig index 28cf308..efefb5a 100644 --- a/drivers/block/paride/Kconfig +++ b/drivers/block/paride/Kconfig @@ -205,8 +205,8 @@ config PARIDE_EPAT support. config PARIDE_EPATC8 - bool "Support c7/c8 chips (EXPERIMENTAL)" - depends on PARIDE_EPAT && EXPERIMENTAL + bool "Support c7/c8 chips" + depends on PARIDE_EPAT help This option enables support for the newer Shuttle EP1284 (aka c7 and c8) chip. You need this if you are using any recent Imation SuperDisk diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 89576a0..6c81a4c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -52,9 +52,12 @@ #define SECTOR_SHIFT 9 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) -/* It might be useful to have this defined elsewhere too */ +/* It might be useful to have these defined elsewhere */ -#define U64_MAX ((u64) (~0ULL)) +#define U8_MAX ((u8) (~0U)) +#define U16_MAX ((u16) (~0U)) +#define U32_MAX ((u32) (~0U)) +#define U64_MAX ((u64) (~0ULL)) #define RBD_DRV_NAME "rbd" #define RBD_DRV_NAME_LONG "rbd (rados block device)" @@ -66,7 +69,6 @@ (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ -#define RBD_MAX_OPT_LEN 1024 #define RBD_SNAP_HEAD_NAME "-" @@ -93,8 +95,6 @@ #define DEV_NAME_LEN 32 #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) -#define RBD_READ_ONLY_DEFAULT false - /* * block device image metadata (in-memory version) */ @@ -119,16 +119,33 @@ struct rbd_image_header { * An rbd image specification. * * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely - * identify an image. + * identify an image. Each rbd_dev structure includes a pointer to + * an rbd_spec structure that encapsulates this identity. + * + * Each of the id's in an rbd_spec has an associated name. For a + * user-mapped image, the names are supplied and the id's associated + * with them are looked up. For a layered image, a parent image is + * defined by the tuple, and the names are looked up. + * + * An rbd_dev structure contains a parent_spec pointer which is + * non-null if the image it represents is a child in a layered + * image. This pointer will refer to the rbd_spec structure used + * by the parent rbd_dev for its own identity (i.e., the structure + * is shared between the parent and child). + * + * Since these structures are populated once, during the discovery + * phase of image construction, they are effectively immutable so + * we make no effort to synchronize access to them. + * + * Note that code herein does not assume the image name is known (it + * could be a null pointer). */ struct rbd_spec { u64 pool_id; char *pool_name; char *image_id; - size_t image_id_len; char *image_name; - size_t image_name_len; u64 snap_id; char *snap_name; @@ -136,10 +153,6 @@ struct rbd_spec { struct kref kref; }; -struct rbd_options { - bool read_only; -}; - /* * an instance of the client. multiple devices may share an rbd client. */ @@ -149,37 +162,76 @@ struct rbd_client { struct list_head node; }; -/* - * a request completion status - */ -struct rbd_req_status { - int done; - int rc; - u64 bytes; +struct rbd_img_request; +typedef void (*rbd_img_callback_t)(struct rbd_img_request *); + +#define BAD_WHICH U32_MAX /* Good which or bad which, which? */ + +struct rbd_obj_request; +typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); + +enum obj_request_type { + OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES }; -/* - * a collection of requests - */ -struct rbd_req_coll { - int total; - int num_done; +struct rbd_obj_request { + const char *object_name; + u64 offset; /* object start byte */ + u64 length; /* bytes from offset */ + + struct rbd_img_request *img_request; + struct list_head links; /* img_request->obj_requests */ + u32 which; /* posn image request list */ + + enum obj_request_type type; + union { + struct bio *bio_list; + struct { + struct page **pages; + u32 page_count; + }; + }; + + struct ceph_osd_request *osd_req; + + u64 xferred; /* bytes transferred */ + u64 version; + int result; + atomic_t done; + + rbd_obj_callback_t callback; + struct completion completion; + struct kref kref; - struct rbd_req_status status[0]; }; -/* - * a single io request - */ -struct rbd_request { - struct request *rq; /* blk layer request */ - struct bio *bio; /* cloned bio */ - struct page **pages; /* list of used pages */ - u64 len; - int coll_index; - struct rbd_req_coll *coll; +struct rbd_img_request { + struct request *rq; + struct rbd_device *rbd_dev; + u64 offset; /* starting image byte offset */ + u64 length; /* byte count from offset */ + bool write_request; /* false for read */ + union { + struct ceph_snap_context *snapc; /* for writes */ + u64 snap_id; /* for reads */ + }; + spinlock_t completion_lock;/* protects next_completion */ + u32 next_completion; + rbd_img_callback_t callback; + + u32 obj_request_count; + struct list_head obj_requests; /* rbd_obj_request structs */ + + struct kref kref; }; +#define for_each_obj_request(ireq, oreq) \ + list_for_each_entry(oreq, &(ireq)->obj_requests, links) +#define for_each_obj_request_from(ireq, oreq) \ + list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) +#define for_each_obj_request_safe(ireq, oreq, n) \ + list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) + struct rbd_snap { struct device dev; const char *name; @@ -209,16 +261,18 @@ struct rbd_device { char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ - spinlock_t lock; /* queue lock */ + spinlock_t lock; /* queue, flags, open_count */ struct rbd_image_header header; - bool exists; + unsigned long flags; /* possibly lock protected */ struct rbd_spec *spec; char *header_name; + struct ceph_file_layout layout; + struct ceph_osd_event *watch_event; - struct ceph_osd_request *watch_request; + struct rbd_obj_request *watch_request; struct rbd_spec *parent_spec; u64 parent_overlap; @@ -235,7 +289,19 @@ struct rbd_device { /* sysfs related */ struct device dev; - unsigned long open_count; + unsigned long open_count; /* protected by lock */ +}; + +/* + * Flag bits for rbd_dev->flags. If atomicity is required, + * rbd_dev->lock is used to protect access. + * + * Currently, only the "removing" flag (which is coupled with the + * "open_count" field) requires atomic access. + */ +enum rbd_dev_flags { + RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ + RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ }; static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ @@ -277,6 +343,33 @@ static struct device rbd_root_dev = { .release = rbd_root_dev_release, }; +static __printf(2, 3) +void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (!rbd_dev) + printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); + else if (rbd_dev->disk) + printk(KERN_WARNING "%s: %s: %pV\n", + RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); + else if (rbd_dev->spec && rbd_dev->spec->image_name) + printk(KERN_WARNING "%s: image %s: %pV\n", + RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); + else if (rbd_dev->spec && rbd_dev->spec->image_id) + printk(KERN_WARNING "%s: id %s: %pV\n", + RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); + else /* punt */ + printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", + RBD_DRV_NAME, rbd_dev, &vaf); + va_end(args); +} + #ifdef RBD_DEBUG #define rbd_assert(expr) \ if (unlikely(!(expr))) { \ @@ -296,14 +389,23 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); static int rbd_open(struct block_device *bdev, fmode_t mode) { struct rbd_device *rbd_dev = bdev->bd_disk->private_data; + bool removing = false; if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) return -EROFS; + spin_lock_irq(&rbd_dev->lock); + if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) + removing = true; + else + rbd_dev->open_count++; + spin_unlock_irq(&rbd_dev->lock); + if (removing) + return -ENOENT; + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); (void) get_device(&rbd_dev->dev); set_device_ro(bdev, rbd_dev->mapping.read_only); - rbd_dev->open_count++; mutex_unlock(&ctl_mutex); return 0; @@ -312,10 +414,14 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) static int rbd_release(struct gendisk *disk, fmode_t mode) { struct rbd_device *rbd_dev = disk->private_data; + unsigned long open_count_before; + + spin_lock_irq(&rbd_dev->lock); + open_count_before = rbd_dev->open_count--; + spin_unlock_irq(&rbd_dev->lock); + rbd_assert(open_count_before > 0); mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rbd_assert(rbd_dev->open_count > 0); - rbd_dev->open_count--; put_device(&rbd_dev->dev); mutex_unlock(&ctl_mutex); @@ -337,7 +443,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) struct rbd_client *rbdc; int ret = -ENOMEM; - dout("rbd_client_create\n"); + dout("%s:\n", __func__); rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); if (!rbdc) goto out_opt; @@ -361,8 +467,8 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) spin_unlock(&rbd_client_list_lock); mutex_unlock(&ctl_mutex); + dout("%s: rbdc %p\n", __func__, rbdc); - dout("rbd_client_create created %p\n", rbdc); return rbdc; out_err: @@ -373,6 +479,8 @@ out_mutex: out_opt: if (ceph_opts) ceph_destroy_options(ceph_opts); + dout("%s: error %d\n", __func__, ret); + return ERR_PTR(ret); } @@ -426,6 +534,12 @@ static match_table_t rbd_opts_tokens = { {-1, NULL} }; +struct rbd_options { + bool read_only; +}; + +#define RBD_READ_ONLY_DEFAULT false + static int parse_rbd_opts_token(char *c, void *private) { struct rbd_options *rbd_opts = private; @@ -493,7 +607,7 @@ static void rbd_client_release(struct kref *kref) { struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); - dout("rbd_release_client %p\n", rbdc); + dout("%s: rbdc %p\n", __func__, rbdc); spin_lock(&rbd_client_list_lock); list_del(&rbdc->node); spin_unlock(&rbd_client_list_lock); @@ -512,18 +626,6 @@ static void rbd_put_client(struct rbd_client *rbdc) kref_put(&rbdc->kref, rbd_client_release); } -/* - * Destroy requests collection - */ -static void rbd_coll_release(struct kref *kref) -{ - struct rbd_req_coll *coll = - container_of(kref, struct rbd_req_coll, kref); - - dout("rbd_coll_release %p\n", coll); - kfree(coll); -} - static bool rbd_image_format_valid(u32 image_format) { return image_format == 1 || image_format == 2; @@ -707,7 +809,8 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) goto done; rbd_dev->mapping.read_only = true; } - rbd_dev->exists = true; + set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); + done: return ret; } @@ -724,7 +827,7 @@ static void rbd_header_free(struct rbd_image_header *header) header->snapc = NULL; } -static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) +static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) { char *name; u64 segment; @@ -767,23 +870,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev, return length; } -static int rbd_get_num_segments(struct rbd_image_header *header, - u64 ofs, u64 len) -{ - u64 start_seg; - u64 end_seg; - - if (!len) - return 0; - if (len - 1 > U64_MAX - ofs) - return -ERANGE; - - start_seg = ofs >> header->obj_order; - end_seg = (ofs + len - 1) >> header->obj_order; - - return end_seg - start_seg + 1; -} - /* * returns the size of an object in the image */ @@ -949,8 +1035,10 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src, unsigned int bi_size; struct bio *bio; - if (!bi) + if (!bi) { + rbd_warn(NULL, "bio_chain exhausted with %u left", len); goto out_err; /* EINVAL; ran out of bio's */ + } bi_size = min_t(unsigned int, bi->bi_size - off, len); bio = bio_clone_range(bi, off, bi_size, gfpmask); if (!bio) @@ -976,399 +1064,721 @@ out_err: return NULL; } -/* - * helpers for osd request op vectors. - */ -static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, - int opcode, u32 payload_len) +static void rbd_obj_request_get(struct rbd_obj_request *obj_request) { - struct ceph_osd_req_op *ops; + dout("%s: obj %p (was %d)\n", __func__, obj_request, + atomic_read(&obj_request->kref.refcount)); + kref_get(&obj_request->kref); +} + +static void rbd_obj_request_destroy(struct kref *kref); +static void rbd_obj_request_put(struct rbd_obj_request *obj_request) +{ + rbd_assert(obj_request != NULL); + dout("%s: obj %p (was %d)\n", __func__, obj_request, + atomic_read(&obj_request->kref.refcount)); + kref_put(&obj_request->kref, rbd_obj_request_destroy); +} + +static void rbd_img_request_get(struct rbd_img_request *img_request) +{ + dout("%s: img %p (was %d)\n", __func__, img_request, + atomic_read(&img_request->kref.refcount)); + kref_get(&img_request->kref); +} + +static void rbd_img_request_destroy(struct kref *kref); +static void rbd_img_request_put(struct rbd_img_request *img_request) +{ + rbd_assert(img_request != NULL); + dout("%s: img %p (was %d)\n", __func__, img_request, + atomic_read(&img_request->kref.refcount)); + kref_put(&img_request->kref, rbd_img_request_destroy); +} + +static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, + struct rbd_obj_request *obj_request) +{ + rbd_assert(obj_request->img_request == NULL); + + rbd_obj_request_get(obj_request); + obj_request->img_request = img_request; + obj_request->which = img_request->obj_request_count; + rbd_assert(obj_request->which != BAD_WHICH); + img_request->obj_request_count++; + list_add_tail(&obj_request->links, &img_request->obj_requests); + dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, + obj_request->which); +} - ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); - if (!ops) +static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, + struct rbd_obj_request *obj_request) +{ + rbd_assert(obj_request->which != BAD_WHICH); + + dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, + obj_request->which); + list_del(&obj_request->links); + rbd_assert(img_request->obj_request_count > 0); + img_request->obj_request_count--; + rbd_assert(obj_request->which == img_request->obj_request_count); + obj_request->which = BAD_WHICH; + rbd_assert(obj_request->img_request == img_request); + obj_request->img_request = NULL; + obj_request->callback = NULL; + rbd_obj_request_put(obj_request); +} + +static bool obj_request_type_valid(enum obj_request_type type) +{ + switch (type) { + case OBJ_REQUEST_NODATA: + case OBJ_REQUEST_BIO: + case OBJ_REQUEST_PAGES: + return true; + default: + return false; + } +} + +static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) +{ + struct ceph_osd_req_op *op; + va_list args; + size_t size; + + op = kzalloc(sizeof (*op), GFP_NOIO); + if (!op) return NULL; + op->op = opcode; + va_start(args, opcode); + switch (opcode) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + /* rbd_osd_req_op_create(READ, offset, length) */ + /* rbd_osd_req_op_create(WRITE, offset, length) */ + op->extent.offset = va_arg(args, u64); + op->extent.length = va_arg(args, u64); + if (opcode == CEPH_OSD_OP_WRITE) + op->payload_len = op->extent.length; + break; + case CEPH_OSD_OP_STAT: + break; + case CEPH_OSD_OP_CALL: + /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */ + op->cls.class_name = va_arg(args, char *); + size = strlen(op->cls.class_name); + rbd_assert(size <= (size_t) U8_MAX); + op->cls.class_len = size; + op->payload_len = size; + + op->cls.method_name = va_arg(args, char *); + size = strlen(op->cls.method_name); + rbd_assert(size <= (size_t) U8_MAX); + op->cls.method_len = size; + op->payload_len += size; + + op->cls.argc = 0; + op->cls.indata = va_arg(args, void *); + size = va_arg(args, size_t); + rbd_assert(size <= (size_t) U32_MAX); + op->cls.indata_len = (u32) size; + op->payload_len += size; + break; + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */ + /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */ + op->watch.cookie = va_arg(args, u64); + op->watch.ver = va_arg(args, u64); + op->watch.ver = cpu_to_le64(op->watch.ver); + if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int)) + op->watch.flag = (u8) 1; + break; + default: + rbd_warn(NULL, "unsupported opcode %hu\n", opcode); + kfree(op); + op = NULL; + break; + } + va_end(args); - ops[0].op = opcode; + return op; +} - /* - * op extent offset and length will be set later on - * in calc_raw_layout() - */ - ops[0].payload_len = payload_len; +static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) +{ + kfree(op); +} + +static int rbd_obj_request_submit(struct ceph_osd_client *osdc, + struct rbd_obj_request *obj_request) +{ + dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); - return ops; + return ceph_osdc_start_request(osdc, obj_request->osd_req, false); } -static void rbd_destroy_ops(struct ceph_osd_req_op *ops) +static void rbd_img_request_complete(struct rbd_img_request *img_request) { - kfree(ops); + dout("%s: img %p\n", __func__, img_request); + if (img_request->callback) + img_request->callback(img_request); + else + rbd_img_request_put(img_request); } -static void rbd_coll_end_req_index(struct request *rq, - struct rbd_req_coll *coll, - int index, - int ret, u64 len) +/* Caller is responsible for rbd_obj_request_destroy(obj_request) */ + +static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) { - struct request_queue *q; - int min, max, i; + dout("%s: obj %p\n", __func__, obj_request); - dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", - coll, index, ret, (unsigned long long) len); + return wait_for_completion_interruptible(&obj_request->completion); +} - if (!rq) - return; +static void obj_request_done_init(struct rbd_obj_request *obj_request) +{ + atomic_set(&obj_request->done, 0); + smp_wmb(); +} - if (!coll) { - blk_end_request(rq, ret, len); - return; +static void obj_request_done_set(struct rbd_obj_request *obj_request) +{ + int done; + + done = atomic_inc_return(&obj_request->done); + if (done > 1) { + struct rbd_img_request *img_request = obj_request->img_request; + struct rbd_device *rbd_dev; + + rbd_dev = img_request ? img_request->rbd_dev : NULL; + rbd_warn(rbd_dev, "obj_request %p was already done\n", + obj_request); } +} - q = rq->q; - - spin_lock_irq(q->queue_lock); - coll->status[index].done = 1; - coll->status[index].rc = ret; - coll->status[index].bytes = len; - max = min = coll->num_done; - while (max < coll->total && coll->status[max].done) - max++; - - for (i = min; i<max; i++) { - __blk_end_request(rq, coll->status[i].rc, - coll->status[i].bytes); - coll->num_done++; - kref_put(&coll->kref, rbd_coll_release); +static bool obj_request_done_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return atomic_read(&obj_request->done) != 0; +} + +static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p cb %p\n", __func__, obj_request, + obj_request->callback); + if (obj_request->callback) + obj_request->callback(obj_request); + else + complete_all(&obj_request->completion); +} + +static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p\n", __func__, obj_request); + obj_request_done_set(obj_request); +} + +static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, + obj_request->result, obj_request->xferred, obj_request->length); + /* + * ENOENT means a hole in the object. We zero-fill the + * entire length of the request. A short read also implies + * zero-fill to the end of the request. Either way we + * update the xferred count to indicate the whole request + * was satisfied. + */ + if (obj_request->result == -ENOENT) { + zero_bio_chain(obj_request->bio_list, 0); + obj_request->result = 0; + obj_request->xferred = obj_request->length; + } else if (obj_request->xferred < obj_request->length && + !obj_request->result) { + zero_bio_chain(obj_request->bio_list, obj_request->xferred); + obj_request->xferred = obj_request->length; } - spin_unlock_irq(q->queue_lock); + obj_request_done_set(obj_request); } -static void rbd_coll_end_req(struct rbd_request *req, - int ret, u64 len) +static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) { - rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); + dout("%s: obj %p result %d %llu\n", __func__, obj_request, + obj_request->result, obj_request->length); + /* + * There is no such thing as a successful short write. + * Our xferred value is the number of bytes transferred + * back. Set it to our originally-requested length. + */ + obj_request->xferred = obj_request->length; + obj_request_done_set(obj_request); } /* - * Send ceph osd request + * For a simple stat call there's nothing to do. We'll do more if + * this is part of a write sequence for a layered image. */ -static int rbd_do_request(struct request *rq, - struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 snapid, - const char *object_name, u64 ofs, u64 len, - struct bio *bio, - struct page **pages, - int num_pages, - int flags, - struct ceph_osd_req_op *ops, - struct rbd_req_coll *coll, - int coll_index, - void (*rbd_cb)(struct ceph_osd_request *req, - struct ceph_msg *msg), - struct ceph_osd_request **linger_req, - u64 *ver) -{ - struct ceph_osd_request *req; - struct ceph_file_layout *layout; - int ret; - u64 bno; - struct timespec mtime = CURRENT_TIME; - struct rbd_request *req_data; - struct ceph_osd_request_head *reqhead; - struct ceph_osd_client *osdc; +static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p\n", __func__, obj_request); + obj_request_done_set(obj_request); +} - req_data = kzalloc(sizeof(*req_data), GFP_NOIO); - if (!req_data) { - if (coll) - rbd_coll_end_req_index(rq, coll, coll_index, - -ENOMEM, len); - return -ENOMEM; +static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, + struct ceph_msg *msg) +{ + struct rbd_obj_request *obj_request = osd_req->r_priv; + u16 opcode; + + dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); + rbd_assert(osd_req == obj_request->osd_req); + rbd_assert(!!obj_request->img_request ^ + (obj_request->which == BAD_WHICH)); + + if (osd_req->r_result < 0) + obj_request->result = osd_req->r_result; + obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); + + WARN_ON(osd_req->r_num_ops != 1); /* For now */ + + /* + * We support a 64-bit length, but ultimately it has to be + * passed to blk_end_request(), which takes an unsigned int. + */ + obj_request->xferred = osd_req->r_reply_op_len[0]; + rbd_assert(obj_request->xferred < (u64) UINT_MAX); + opcode = osd_req->r_request_ops[0].op; + switch (opcode) { + case CEPH_OSD_OP_READ: + rbd_osd_read_callback(obj_request); + break; + case CEPH_OSD_OP_WRITE: + rbd_osd_write_callback(obj_request); + break; + case CEPH_OSD_OP_STAT: + rbd_osd_stat_callback(obj_request); + break; + case CEPH_OSD_OP_CALL: + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + rbd_osd_trivial_callback(obj_request); + break; + default: + rbd_warn(NULL, "%s: unsupported op %hu\n", + obj_request->object_name, (unsigned short) opcode); + break; } - if (coll) { - req_data->coll = coll; - req_data->coll_index = coll_index; + if (obj_request_done_test(obj_request)) + rbd_obj_request_complete(obj_request); +} + +static struct ceph_osd_request *rbd_osd_req_create( + struct rbd_device *rbd_dev, + bool write_request, + struct rbd_obj_request *obj_request, + struct ceph_osd_req_op *op) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct ceph_snap_context *snapc = NULL; + struct ceph_osd_client *osdc; + struct ceph_osd_request *osd_req; + struct timespec now; + struct timespec *mtime; + u64 snap_id = CEPH_NOSNAP; + u64 offset = obj_request->offset; + u64 length = obj_request->length; + + if (img_request) { + rbd_assert(img_request->write_request == write_request); + if (img_request->write_request) + snapc = img_request->snapc; + else + snap_id = img_request->snap_id; } - dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", - object_name, (unsigned long long) ofs, - (unsigned long long) len, coll, coll_index); + /* Allocate and initialize the request, for the single op */ osdc = &rbd_dev->rbd_client->client->osdc; - req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, - false, GFP_NOIO, pages, bio); - if (!req) { - ret = -ENOMEM; - goto done_pages; + osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); + if (!osd_req) + return NULL; /* ENOMEM */ + + rbd_assert(obj_request_type_valid(obj_request->type)); + switch (obj_request->type) { + case OBJ_REQUEST_NODATA: + break; /* Nothing to do */ + case OBJ_REQUEST_BIO: + rbd_assert(obj_request->bio_list != NULL); + osd_req->r_bio = obj_request->bio_list; + break; + case OBJ_REQUEST_PAGES: + osd_req->r_pages = obj_request->pages; + osd_req->r_num_pages = obj_request->page_count; + osd_req->r_page_alignment = offset & ~PAGE_MASK; + break; } - req->r_callback = rbd_cb; + if (write_request) { + osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + now = CURRENT_TIME; + mtime = &now; + } else { + osd_req->r_flags = CEPH_OSD_FLAG_READ; + mtime = NULL; /* not needed for reads */ + offset = 0; /* These are not used... */ + length = 0; /* ...for osd read requests */ + } - req_data->rq = rq; - req_data->bio = bio; - req_data->pages = pages; - req_data->len = len; + osd_req->r_callback = rbd_osd_req_callback; + osd_req->r_priv = obj_request; - req->r_priv = req_data; + osd_req->r_oid_len = strlen(obj_request->object_name); + rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); + memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); - reqhead = req->r_request->front.iov_base; - reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); + osd_req->r_file_layout = rbd_dev->layout; /* struct */ - strncpy(req->r_oid, object_name, sizeof(req->r_oid)); - req->r_oid_len = strlen(req->r_oid); + /* osd_req will get its own reference to snapc (if non-null) */ - layout = &req->r_file_layout; - memset(layout, 0, sizeof(*layout)); - layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_stripe_count = cpu_to_le32(1); - layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id); - ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, - req, ops); - rbd_assert(ret == 0); + ceph_osdc_build_request(osd_req, offset, length, 1, op, + snapc, snap_id, mtime); - ceph_osdc_build_request(req, ofs, &len, - ops, - snapc, - &mtime, - req->r_oid, req->r_oid_len); + return osd_req; +} - if (linger_req) { - ceph_osdc_set_request_linger(osdc, req); - *linger_req = req; - } +static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) +{ + ceph_osdc_put_request(osd_req); +} - ret = ceph_osdc_start_request(osdc, req, false); - if (ret < 0) - goto done_err; - - if (!rbd_cb) { - ret = ceph_osdc_wait_request(osdc, req); - if (ver) - *ver = le64_to_cpu(req->r_reassert_version.version); - dout("reassert_ver=%llu\n", - (unsigned long long) - le64_to_cpu(req->r_reassert_version.version)); - ceph_osdc_put_request(req); +/* object_name is assumed to be a non-null pointer and NUL-terminated */ + +static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, + u64 offset, u64 length, + enum obj_request_type type) +{ + struct rbd_obj_request *obj_request; + size_t size; + char *name; + + rbd_assert(obj_request_type_valid(type)); + + size = strlen(object_name) + 1; + obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); + if (!obj_request) + return NULL; + + name = (char *)(obj_request + 1); + obj_request->object_name = memcpy(name, object_name, size); + obj_request->offset = offset; + obj_request->length = length; + obj_request->which = BAD_WHICH; + obj_request->type = type; + INIT_LIST_HEAD(&obj_request->links); + obj_request_done_init(obj_request); + init_completion(&obj_request->completion); + kref_init(&obj_request->kref); + + dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, + offset, length, (int)type, obj_request); + + return obj_request; +} + +static void rbd_obj_request_destroy(struct kref *kref) +{ + struct rbd_obj_request *obj_request; + + obj_request = container_of(kref, struct rbd_obj_request, kref); + + dout("%s: obj %p\n", __func__, obj_request); + + rbd_assert(obj_request->img_request == NULL); + rbd_assert(obj_request->which == BAD_WHICH); + + if (obj_request->osd_req) + rbd_osd_req_destroy(obj_request->osd_req); + + rbd_assert(obj_request_type_valid(obj_request->type)); + switch (obj_request->type) { + case OBJ_REQUEST_NODATA: + break; /* Nothing to do */ + case OBJ_REQUEST_BIO: + if (obj_request->bio_list) + bio_chain_put(obj_request->bio_list); + break; + case OBJ_REQUEST_PAGES: + if (obj_request->pages) + ceph_release_page_vector(obj_request->pages, + obj_request->page_count); + break; } - return ret; -done_err: - bio_chain_put(req_data->bio); - ceph_osdc_put_request(req); -done_pages: - rbd_coll_end_req(req_data, ret, len); - kfree(req_data); - return ret; + kfree(obj_request); } /* - * Ceph osd op callback + * Caller is responsible for filling in the list of object requests + * that comprises the image request, and the Linux request pointer + * (if there is one). */ -static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) -{ - struct rbd_request *req_data = req->r_priv; - struct ceph_osd_reply_head *replyhead; - struct ceph_osd_op *op; - __s32 rc; - u64 bytes; - int read_op; - - /* parse reply */ - replyhead = msg->front.iov_base; - WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); - op = (void *)(replyhead + 1); - rc = le32_to_cpu(replyhead->result); - bytes = le64_to_cpu(op->extent.length); - read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); - - dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", - (unsigned long long) bytes, read_op, (int) rc); - - if (rc == -ENOENT && read_op) { - zero_bio_chain(req_data->bio, 0); - rc = 0; - } else if (rc == 0 && read_op && bytes < req_data->len) { - zero_bio_chain(req_data->bio, bytes); - bytes = req_data->len; - } +static struct rbd_img_request *rbd_img_request_create( + struct rbd_device *rbd_dev, + u64 offset, u64 length, + bool write_request) +{ + struct rbd_img_request *img_request; + struct ceph_snap_context *snapc = NULL; - rbd_coll_end_req(req_data, rc, bytes); + img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); + if (!img_request) + return NULL; - if (req_data->bio) - bio_chain_put(req_data->bio); + if (write_request) { + down_read(&rbd_dev->header_rwsem); + snapc = ceph_get_snap_context(rbd_dev->header.snapc); + up_read(&rbd_dev->header_rwsem); + if (WARN_ON(!snapc)) { + kfree(img_request); + return NULL; /* Shouldn't happen */ + } + } - ceph_osdc_put_request(req); - kfree(req_data); + img_request->rq = NULL; + img_request->rbd_dev = rbd_dev; + img_request->offset = offset; + img_request->length = length; + img_request->write_request = write_request; + if (write_request) + img_request->snapc = snapc; + else + img_request->snap_id = rbd_dev->spec->snap_id; + spin_lock_init(&img_request->completion_lock); + img_request->next_completion = 0; + img_request->callback = NULL; + img_request->obj_request_count = 0; + INIT_LIST_HEAD(&img_request->obj_requests); + kref_init(&img_request->kref); + + rbd_img_request_get(img_request); /* Avoid a warning */ + rbd_img_request_put(img_request); /* TEMPORARY */ + + dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, + write_request ? "write" : "read", offset, length, + img_request); + + return img_request; } -static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) +static void rbd_img_request_destroy(struct kref *kref) { - ceph_osdc_put_request(req); + struct rbd_img_request *img_request; + struct rbd_obj_request *obj_request; + struct rbd_obj_request *next_obj_request; + + img_request = container_of(kref, struct rbd_img_request, kref); + + dout("%s: img %p\n", __func__, img_request); + + for_each_obj_request_safe(img_request, obj_request, next_obj_request) + rbd_img_obj_request_del(img_request, obj_request); + rbd_assert(img_request->obj_request_count == 0); + + if (img_request->write_request) + ceph_put_snap_context(img_request->snapc); + + kfree(img_request); } -/* - * Do a synchronous ceph osd operation - */ -static int rbd_req_sync_op(struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 snapid, - int flags, - struct ceph_osd_req_op *ops, - const char *object_name, - u64 ofs, u64 inbound_size, - char *inbound, - struct ceph_osd_request **linger_req, - u64 *ver) +static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, + struct bio *bio_list) { - int ret; - struct page **pages; - int num_pages; - - rbd_assert(ops != NULL); + struct rbd_device *rbd_dev = img_request->rbd_dev; + struct rbd_obj_request *obj_request = NULL; + struct rbd_obj_request *next_obj_request; + unsigned int bio_offset; + u64 image_offset; + u64 resid; + u16 opcode; + + dout("%s: img %p bio %p\n", __func__, img_request, bio_list); + + opcode = img_request->write_request ? CEPH_OSD_OP_WRITE + : CEPH_OSD_OP_READ; + bio_offset = 0; + image_offset = img_request->offset; + rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT); + resid = img_request->length; + rbd_assert(resid > 0); + while (resid) { + const char *object_name; + unsigned int clone_size; + struct ceph_osd_req_op *op; + u64 offset; + u64 length; + + object_name = rbd_segment_name(rbd_dev, image_offset); + if (!object_name) + goto out_unwind; + offset = rbd_segment_offset(rbd_dev, image_offset); + length = rbd_segment_length(rbd_dev, image_offset, resid); + obj_request = rbd_obj_request_create(object_name, + offset, length, + OBJ_REQUEST_BIO); + kfree(object_name); /* object request has its own copy */ + if (!obj_request) + goto out_unwind; + + rbd_assert(length <= (u64) UINT_MAX); + clone_size = (unsigned int) length; + obj_request->bio_list = bio_chain_clone_range(&bio_list, + &bio_offset, clone_size, + GFP_ATOMIC); + if (!obj_request->bio_list) + goto out_partial; - num_pages = calc_pages_for(ofs, inbound_size); - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); + /* + * Build up the op to use in building the osd + * request. Note that the contents of the op are + * copied by rbd_osd_req_create(). + */ + op = rbd_osd_req_op_create(opcode, offset, length); + if (!op) + goto out_partial; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, + img_request->write_request, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out_partial; + /* status and version are initially zero-filled */ + + rbd_img_obj_request_add(img_request, obj_request); + + image_offset += length; + resid -= length; + } - ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, - object_name, ofs, inbound_size, NULL, - pages, num_pages, - flags, - ops, - NULL, 0, - NULL, - linger_req, ver); - if (ret < 0) - goto done; + return 0; - if ((flags & CEPH_OSD_FLAG_READ) && inbound) - ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); +out_partial: + rbd_obj_request_put(obj_request); +out_unwind: + for_each_obj_request_safe(img_request, obj_request, next_obj_request) + rbd_obj_request_put(obj_request); -done: - ceph_release_page_vector(pages, num_pages); - return ret; + return -ENOMEM; } -/* - * Do an asynchronous ceph osd operation - */ -static int rbd_do_op(struct request *rq, - struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 ofs, u64 len, - struct bio *bio, - struct rbd_req_coll *coll, - int coll_index) -{ - char *seg_name; - u64 seg_ofs; - u64 seg_len; - int ret; - struct ceph_osd_req_op *ops; - u32 payload_len; - int opcode; - int flags; - u64 snapid; - - seg_name = rbd_segment_name(rbd_dev, ofs); - if (!seg_name) - return -ENOMEM; - seg_len = rbd_segment_length(rbd_dev, ofs, len); - seg_ofs = rbd_segment_offset(rbd_dev, ofs); - - if (rq_data_dir(rq) == WRITE) { - opcode = CEPH_OSD_OP_WRITE; - flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; - snapid = CEPH_NOSNAP; - payload_len = seg_len; - } else { - opcode = CEPH_OSD_OP_READ; - flags = CEPH_OSD_FLAG_READ; - snapc = NULL; - snapid = rbd_dev->spec->snap_id; - payload_len = 0; +static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + u32 which = obj_request->which; + bool more = true; + + img_request = obj_request->img_request; + + dout("%s: img %p obj %p\n", __func__, img_request, obj_request); + rbd_assert(img_request != NULL); + rbd_assert(img_request->rq != NULL); + rbd_assert(img_request->obj_request_count > 0); + rbd_assert(which != BAD_WHICH); + rbd_assert(which < img_request->obj_request_count); + rbd_assert(which >= img_request->next_completion); + + spin_lock_irq(&img_request->completion_lock); + if (which != img_request->next_completion) + goto out; + + for_each_obj_request_from(img_request, obj_request) { + unsigned int xferred; + int result; + + rbd_assert(more); + rbd_assert(which < img_request->obj_request_count); + + if (!obj_request_done_test(obj_request)) + break; + + rbd_assert(obj_request->xferred <= (u64) UINT_MAX); + xferred = (unsigned int) obj_request->xferred; + result = (int) obj_request->result; + if (result) + rbd_warn(NULL, "obj_request %s result %d xferred %u\n", + img_request->write_request ? "write" : "read", + result, xferred); + + more = blk_end_request(img_request->rq, result, xferred); + which++; } - ret = -ENOMEM; - ops = rbd_create_rw_ops(1, opcode, payload_len); - if (!ops) - goto done; + rbd_assert(more ^ (which == img_request->obj_request_count)); + img_request->next_completion = which; +out: + spin_unlock_irq(&img_request->completion_lock); - /* we've taken care of segment sizes earlier when we - cloned the bios. We should never have a segment - truncated at this point */ - rbd_assert(seg_len == len); - - ret = rbd_do_request(rq, rbd_dev, snapc, snapid, - seg_name, seg_ofs, seg_len, - bio, - NULL, 0, - flags, - ops, - coll, coll_index, - rbd_req_cb, 0, NULL); - - rbd_destroy_ops(ops); -done: - kfree(seg_name); - return ret; + if (!more) + rbd_img_request_complete(img_request); } -/* - * Request sync osd read - */ -static int rbd_req_sync_read(struct rbd_device *rbd_dev, - u64 snapid, - const char *object_name, - u64 ofs, u64 len, - char *buf, - u64 *ver) -{ - struct ceph_osd_req_op *ops; - int ret; +static int rbd_img_request_submit(struct rbd_img_request *img_request) +{ + struct rbd_device *rbd_dev = img_request->rbd_dev; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); - if (!ops) - return -ENOMEM; + dout("%s: img %p\n", __func__, img_request); + for_each_obj_request(img_request, obj_request) { + int ret; - ret = rbd_req_sync_op(rbd_dev, NULL, - snapid, - CEPH_OSD_FLAG_READ, - ops, object_name, ofs, len, buf, NULL, ver); - rbd_destroy_ops(ops); + obj_request->callback = rbd_img_obj_callback; + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + return ret; + /* + * The image request has its own reference to each + * of its object requests, so we can safely drop the + * initial one here. + */ + rbd_obj_request_put(obj_request); + } - return ret; + return 0; } -/* - * Request sync osd watch - */ -static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, - u64 ver, - u64 notify_id) +static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, + u64 ver, u64 notify_id) { - struct ceph_osd_req_op *ops; + struct rbd_obj_request *obj_request; + struct ceph_osd_req_op *op; + struct ceph_osd_client *osdc; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); - if (!ops) + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + OBJ_REQUEST_NODATA); + if (!obj_request) return -ENOMEM; - ops[0].watch.ver = cpu_to_le64(ver); - ops[0].watch.cookie = notify_id; - ops[0].watch.flag = 0; + ret = -ENOMEM; + op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); + if (!op) + goto out; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out; - ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, - rbd_dev->header_name, 0, 0, NULL, - NULL, 0, - CEPH_OSD_FLAG_READ, - ops, - NULL, 0, - rbd_simple_req_cb, 0, NULL); + osdc = &rbd_dev->rbd_client->client->osdc; + obj_request->callback = rbd_obj_request_put; + ret = rbd_obj_request_submit(osdc, obj_request); +out: + if (ret) + rbd_obj_request_put(obj_request); - rbd_destroy_ops(ops); return ret; } @@ -1381,95 +1791,103 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) if (!rbd_dev) return; - dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", + dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, rbd_dev->header_name, (unsigned long long) notify_id, (unsigned int) opcode); rc = rbd_dev_refresh(rbd_dev, &hver); if (rc) - pr_warning(RBD_DRV_NAME "%d got notification but failed to " - " update snaps: %d\n", rbd_dev->major, rc); + rbd_warn(rbd_dev, "got notification but failed to " + " update snaps: %d\n", rc); - rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); + rbd_obj_notify_ack(rbd_dev, hver, notify_id); } /* - * Request sync osd watch + * Request sync osd watch/unwatch. The value of "start" determines + * whether a watch request is being initiated or torn down. */ -static int rbd_req_sync_watch(struct rbd_device *rbd_dev) +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) { - struct ceph_osd_req_op *ops; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; + struct ceph_osd_req_op *op; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); - if (!ops) - return -ENOMEM; + rbd_assert(start ^ !!rbd_dev->watch_event); + rbd_assert(start ^ !!rbd_dev->watch_request); - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, - (void *)rbd_dev, &rbd_dev->watch_event); - if (ret < 0) - goto fail; + if (start) { + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, + &rbd_dev->watch_event); + if (ret < 0) + return ret; + rbd_assert(rbd_dev->watch_event != NULL); + } - ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); - ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - ops[0].watch.flag = 1; + ret = -ENOMEM; + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + OBJ_REQUEST_NODATA); + if (!obj_request) + goto out_cancel; + + op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, + rbd_dev->watch_event->cookie, + rbd_dev->header.obj_version, start); + if (!op) + goto out_cancel; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out_cancel; + + if (start) + ceph_osdc_set_request_linger(osdc, obj_request->osd_req); + else + ceph_osdc_unregister_linger_request(osdc, + rbd_dev->watch_request->osd_req); + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out_cancel; + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out_cancel; + ret = obj_request->result; + if (ret) + goto out_cancel; - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ops, - rbd_dev->header_name, - 0, 0, NULL, - &rbd_dev->watch_request, NULL); + /* + * A watch request is set to linger, so the underlying osd + * request won't go away until we unregister it. We retain + * a pointer to the object request during that time (in + * rbd_dev->watch_request), so we'll keep a reference to + * it. We'll drop that reference (below) after we've + * unregistered it. + */ + if (start) { + rbd_dev->watch_request = obj_request; - if (ret < 0) - goto fail_event; + return 0; + } - rbd_destroy_ops(ops); - return 0; + /* We have successfully torn down the watch request */ -fail_event: + rbd_obj_request_put(rbd_dev->watch_request); + rbd_dev->watch_request = NULL; +out_cancel: + /* Cancel the event if we're tearing down, or on error */ ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; -fail: - rbd_destroy_ops(ops); - return ret; -} + if (obj_request) + rbd_obj_request_put(obj_request); -/* - * Request sync osd unwatch - */ -static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) -{ - struct ceph_osd_req_op *ops; - int ret; - - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); - if (!ops) - return -ENOMEM; - - ops[0].watch.ver = 0; - ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - ops[0].watch.flag = 0; - - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ops, - rbd_dev->header_name, - 0, 0, NULL, NULL, NULL); - - - rbd_destroy_ops(ops); - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; return ret; } /* * Synchronous osd object method call */ -static int rbd_req_sync_exec(struct rbd_device *rbd_dev, +static int rbd_obj_method_sync(struct rbd_device *rbd_dev, const char *object_name, const char *class_name, const char *method_name, @@ -1477,169 +1895,154 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, size_t outbound_size, char *inbound, size_t inbound_size, - int flags, - u64 *ver) + u64 *version) { - struct ceph_osd_req_op *ops; - int class_name_len = strlen(class_name); - int method_name_len = strlen(method_name); - int payload_size; + struct rbd_obj_request *obj_request; + struct ceph_osd_client *osdc; + struct ceph_osd_req_op *op; + struct page **pages; + u32 page_count; int ret; /* - * Any input parameters required by the method we're calling - * will be sent along with the class and method names as - * part of the message payload. That data and its size are - * supplied via the indata and indata_len fields (named from - * the perspective of the server side) in the OSD request - * operation. + * Method calls are ultimately read operations but they + * don't involve object data (so no offset or length). + * The result should placed into the inbound buffer + * provided. They also supply outbound data--parameters for + * the object method. Currently if this is present it will + * be a snapshot id. */ - payload_size = class_name_len + method_name_len + outbound_size; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); - if (!ops) - return -ENOMEM; + page_count = (u32) calc_pages_for(0, inbound_size); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); - ops[0].cls.class_name = class_name; - ops[0].cls.class_len = (__u8) class_name_len; - ops[0].cls.method_name = method_name; - ops[0].cls.method_len = (__u8) method_name_len; - ops[0].cls.argc = 0; - ops[0].cls.indata = outbound; - ops[0].cls.indata_len = outbound_size; + ret = -ENOMEM; + obj_request = rbd_obj_request_create(object_name, 0, 0, + OBJ_REQUEST_PAGES); + if (!obj_request) + goto out; - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, - flags, ops, - object_name, 0, inbound_size, inbound, - NULL, ver); + obj_request->pages = pages; + obj_request->page_count = page_count; - rbd_destroy_ops(ops); + op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, + method_name, outbound, outbound_size); + if (!op) + goto out; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out; - dout("cls_exec returned %d\n", ret); - return ret; -} + osdc = &rbd_dev->rbd_client->client->osdc; + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out; + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out; -static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) -{ - struct rbd_req_coll *coll = - kzalloc(sizeof(struct rbd_req_coll) + - sizeof(struct rbd_req_status) * num_reqs, - GFP_ATOMIC); + ret = obj_request->result; + if (ret < 0) + goto out; + ret = 0; + ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); + if (version) + *version = obj_request->version; +out: + if (obj_request) + rbd_obj_request_put(obj_request); + else + ceph_release_page_vector(pages, page_count); - if (!coll) - return NULL; - coll->total = num_reqs; - kref_init(&coll->kref); - return coll; + return ret; } -/* - * block device queue callback - */ -static void rbd_rq_fn(struct request_queue *q) +static void rbd_request_fn(struct request_queue *q) + __releases(q->queue_lock) __acquires(q->queue_lock) { struct rbd_device *rbd_dev = q->queuedata; + bool read_only = rbd_dev->mapping.read_only; struct request *rq; + int result; while ((rq = blk_fetch_request(q))) { - struct bio *bio; - bool do_write; - unsigned int size; - u64 ofs; - int num_segs, cur_seg = 0; - struct rbd_req_coll *coll; - struct ceph_snap_context *snapc; - unsigned int bio_offset; - - dout("fetched request\n"); - - /* filter out block requests we don't understand */ - if ((rq->cmd_type != REQ_TYPE_FS)) { - __blk_end_request_all(rq, 0); - continue; - } + bool write_request = rq_data_dir(rq) == WRITE; + struct rbd_img_request *img_request; + u64 offset; + u64 length; + + /* Ignore any non-FS requests that filter through. */ - /* deduce our operation (read, write) */ - do_write = (rq_data_dir(rq) == WRITE); - if (do_write && rbd_dev->mapping.read_only) { - __blk_end_request_all(rq, -EROFS); + if (rq->cmd_type != REQ_TYPE_FS) { + dout("%s: non-fs request type %d\n", __func__, + (int) rq->cmd_type); + __blk_end_request_all(rq, 0); continue; } - spin_unlock_irq(q->queue_lock); + /* Ignore/skip any zero-length requests */ - down_read(&rbd_dev->header_rwsem); + offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; + length = (u64) blk_rq_bytes(rq); - if (!rbd_dev->exists) { - rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); - up_read(&rbd_dev->header_rwsem); - dout("request for non-existent snapshot"); - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, -ENXIO); + if (!length) { + dout("%s: zero-length request\n", __func__); + __blk_end_request_all(rq, 0); continue; } - snapc = ceph_get_snap_context(rbd_dev->header.snapc); - - up_read(&rbd_dev->header_rwsem); - - size = blk_rq_bytes(rq); - ofs = blk_rq_pos(rq) * SECTOR_SIZE; - bio = rq->bio; + spin_unlock_irq(q->queue_lock); - dout("%s 0x%x bytes at 0x%llx\n", - do_write ? "write" : "read", - size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); + /* Disallow writes to a read-only device */ - num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); - if (num_segs <= 0) { - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, num_segs); - ceph_put_snap_context(snapc); - continue; + if (write_request) { + result = -EROFS; + if (read_only) + goto end_request; + rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); } - coll = rbd_alloc_coll(num_segs); - if (!coll) { - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, -ENOMEM); - ceph_put_snap_context(snapc); - continue; - } - - bio_offset = 0; - do { - u64 limit = rbd_segment_length(rbd_dev, ofs, size); - unsigned int chain_size; - struct bio *bio_chain; - - BUG_ON(limit > (u64) UINT_MAX); - chain_size = (unsigned int) limit; - dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); - kref_get(&coll->kref); + /* + * Quit early if the mapped snapshot no longer + * exists. It's still possible the snapshot will + * have disappeared by the time our request arrives + * at the osd, but there's no sense in sending it if + * we already know. + */ + if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { + dout("request for non-existent snapshot"); + rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); + result = -ENXIO; + goto end_request; + } - /* Pass a cloned bio chain via an osd request */ + result = -EINVAL; + if (WARN_ON(offset && length > U64_MAX - offset + 1)) + goto end_request; /* Shouldn't happen */ - bio_chain = bio_chain_clone_range(&bio, - &bio_offset, chain_size, - GFP_ATOMIC); - if (bio_chain) - (void) rbd_do_op(rq, rbd_dev, snapc, - ofs, chain_size, - bio_chain, coll, cur_seg); - else - rbd_coll_end_req_index(rq, coll, cur_seg, - -ENOMEM, chain_size); - size -= chain_size; - ofs += chain_size; + result = -ENOMEM; + img_request = rbd_img_request_create(rbd_dev, offset, length, + write_request); + if (!img_request) + goto end_request; - cur_seg++; - } while (size > 0); - kref_put(&coll->kref, rbd_coll_release); + img_request->rq = rq; + result = rbd_img_request_fill_bio(img_request, rq->bio); + if (!result) + result = rbd_img_request_submit(img_request); + if (result) + rbd_img_request_put(img_request); +end_request: spin_lock_irq(q->queue_lock); - - ceph_put_snap_context(snapc); + if (result < 0) { + rbd_warn(rbd_dev, "obj_request %s result %d\n", + write_request ? "write" : "read", result); + __blk_end_request_all(rq, result); + } } } @@ -1703,6 +2106,71 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) put_disk(disk); } +static int rbd_obj_read_sync(struct rbd_device *rbd_dev, + const char *object_name, + u64 offset, u64 length, + char *buf, u64 *version) + +{ + struct ceph_osd_req_op *op; + struct rbd_obj_request *obj_request; + struct ceph_osd_client *osdc; + struct page **pages = NULL; + u32 page_count; + size_t size; + int ret; + + page_count = (u32) calc_pages_for(offset, length); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) + ret = PTR_ERR(pages); + + ret = -ENOMEM; + obj_request = rbd_obj_request_create(object_name, offset, length, + OBJ_REQUEST_PAGES); + if (!obj_request) + goto out; + + obj_request->pages = pages; + obj_request->page_count = page_count; + + op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length); + if (!op) + goto out; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out; + + osdc = &rbd_dev->rbd_client->client->osdc; + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out; + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out; + + ret = obj_request->result; + if (ret < 0) + goto out; + + rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); + size = (size_t) obj_request->xferred; + ceph_copy_from_page_vector(pages, buf, 0, size); + rbd_assert(size <= (size_t) INT_MAX); + ret = (int) size; + if (version) + *version = obj_request->version; +out: + if (obj_request) + rbd_obj_request_put(obj_request); + else + ceph_release_page_vector(pages, page_count); + + return ret; +} + /* * Read the complete header for the given rbd device. * @@ -1741,24 +2209,20 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) if (!ondisk) return ERR_PTR(-ENOMEM); - ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, - rbd_dev->header_name, + ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 0, size, (char *) ondisk, version); - if (ret < 0) goto out_err; if (WARN_ON((size_t) ret < size)) { ret = -ENXIO; - pr_warning("short header read for image %s" - " (want %zd got %d)\n", - rbd_dev->spec->image_name, size, ret); + rbd_warn(rbd_dev, "short header read (want %zd got %d)", + size, ret); goto out_err; } if (!rbd_dev_ondisk_valid(ondisk)) { ret = -ENXIO; - pr_warning("invalid header for image %s\n", - rbd_dev->spec->image_name); + rbd_warn(rbd_dev, "invalid header"); goto out_err; } @@ -1895,8 +2359,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; - /* init rq */ - q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); + q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); if (!q) goto out_disk; @@ -2233,7 +2696,7 @@ static void rbd_spec_free(struct kref *kref) kfree(spec); } -struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, +static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, struct rbd_spec *spec) { struct rbd_device *rbd_dev; @@ -2243,6 +2706,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, return NULL; spin_lock_init(&rbd_dev->lock); + rbd_dev->flags = 0; INIT_LIST_HEAD(&rbd_dev->node); INIT_LIST_HEAD(&rbd_dev->snaps); init_rwsem(&rbd_dev->header_rwsem); @@ -2250,6 +2714,13 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, rbd_dev->spec = spec; rbd_dev->rbd_client = rbdc; + /* Initialize the layout used for all rbd requests */ + + rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); + rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); + return rbd_dev; } @@ -2360,12 +2831,11 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, __le64 size; } __attribute__ ((packed)) size_buf = { 0 }; - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_size", (char *) &snapid, sizeof (snapid), - (char *) &size_buf, sizeof (size_buf), - CEPH_OSD_FLAG_READ, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + (char *) &size_buf, sizeof (size_buf), NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -2396,15 +2866,13 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_object_prefix", NULL, 0, - reply_buf, RBD_OBJ_PREFIX_LEN_MAX, - CEPH_OSD_FLAG_READ, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; - ret = 0; /* rbd_req_sync_exec() can return positive */ p = reply_buf; rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, @@ -2435,12 +2903,12 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, u64 incompat; int ret; - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_features", (char *) &snapid, sizeof (snapid), (char *) &features_buf, sizeof (features_buf), - CEPH_OSD_FLAG_READ, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -2474,7 +2942,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) void *end; char *image_id; u64 overlap; - size_t len = 0; int ret; parent_spec = rbd_spec_alloc(); @@ -2492,12 +2959,11 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) } snapid = cpu_to_le64(CEPH_NOSNAP); - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_parent", (char *) &snapid, sizeof (snapid), - (char *) reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + (char *) reply_buf, size, NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out_err; @@ -2508,13 +2974,18 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) if (parent_spec->pool_id == CEPH_NOPOOL) goto out; /* No parent? No problem. */ - image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); + /* The ceph file layout needs to fit pool id in 32 bits */ + + ret = -EIO; + if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) + goto out; + + image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { ret = PTR_ERR(image_id); goto out_err; } parent_spec->image_id = image_id; - parent_spec->image_id_len = len; ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); ceph_decode_64_safe(&p, end, overlap, out_err); @@ -2544,26 +3015,25 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) rbd_assert(!rbd_dev->spec->image_name); - image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len; + len = strlen(rbd_dev->spec->image_id); + image_id_size = sizeof (__le32) + len; image_id = kmalloc(image_id_size, GFP_KERNEL); if (!image_id) return NULL; p = image_id; end = (char *) image_id + image_id_size; - ceph_encode_string(&p, end, rbd_dev->spec->image_id, - (u32) rbd_dev->spec->image_id_len); + ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; reply_buf = kmalloc(size, GFP_KERNEL); if (!reply_buf) goto out; - ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, + ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, "rbd", "dir_get_name", image_id, image_id_size, - (char *) reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); + (char *) reply_buf, size, NULL); if (ret < 0) goto out; p = reply_buf; @@ -2602,8 +3072,11 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) osdc = &rbd_dev->rbd_client->client->osdc; name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); - if (!name) - return -EIO; /* pool id too large (>= 2^31) */ + if (!name) { + rbd_warn(rbd_dev, "there is no pool with id %llu", + rbd_dev->spec->pool_id); /* Really a BUG() */ + return -EIO; + } rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); if (!rbd_dev->spec->pool_name) @@ -2612,19 +3085,17 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) /* Fetch the image name; tolerate failure here */ name = rbd_dev_image_name(rbd_dev); - if (name) { - rbd_dev->spec->image_name_len = strlen(name); + if (name) rbd_dev->spec->image_name = (char *) name; - } else { - pr_warning(RBD_DRV_NAME "%d " - "unable to get image name for image id %s\n", - rbd_dev->major, rbd_dev->spec->image_id); - } + else + rbd_warn(rbd_dev, "unable to get image name"); /* Look up the snapshot name. */ name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); if (!name) { + rbd_warn(rbd_dev, "no snapshot with id %llu", + rbd_dev->spec->snap_id); /* Really a BUG() */ ret = -EIO; goto out_err; } @@ -2665,12 +3136,11 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) if (!reply_buf) return -ENOMEM; - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapcontext", NULL, 0, - reply_buf, size, - CEPH_OSD_FLAG_READ, ver); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + reply_buf, size, ver); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -2735,12 +3205,11 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) return ERR_PTR(-ENOMEM); snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapshot_name", (char *) &snap_id, sizeof (snap_id), - reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + reply_buf, size, NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -2766,7 +3235,7 @@ out: static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, u64 *snap_size, u64 *snap_features) { - __le64 snap_id; + u64 snap_id; u8 order; int ret; @@ -2865,10 +3334,17 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { struct list_head *next = links->next; - /* Existing snapshot not in the new snap context */ - + /* + * A previously-existing snapshot is not in + * the new snap context. + * + * If the now missing snapshot is the one the + * image is mapped to, clear its exists flag + * so we can avoid sending any more requests + * to it. + */ if (rbd_dev->spec->snap_id == snap->id) - rbd_dev->exists = false; + clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); rbd_remove_snap_dev(snap); dout("%ssnap id %llu has been removed\n", rbd_dev->spec->snap_id == snap->id ? @@ -2942,7 +3418,7 @@ static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) struct rbd_snap *snap; int ret = 0; - dout("%s called\n", __func__); + dout("%s:\n", __func__); if (WARN_ON(!device_is_registered(&rbd_dev->dev))) return -EIO; @@ -2983,22 +3459,6 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev) device_unregister(&rbd_dev->dev); } -static int rbd_init_watch_dev(struct rbd_device *rbd_dev) -{ - int ret, rc; - - do { - ret = rbd_req_sync_watch(rbd_dev); - if (ret == -ERANGE) { - rc = rbd_dev_refresh(rbd_dev, NULL); - if (rc < 0) - return rc; - } - } while (ret == -ERANGE); - - return ret; -} - static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); /* @@ -3138,11 +3598,9 @@ static inline char *dup_token(const char **buf, size_t *lenp) size_t len; len = next_token(buf); - dup = kmalloc(len + 1, GFP_KERNEL); + dup = kmemdup(*buf, len + 1, GFP_KERNEL); if (!dup) return NULL; - - memcpy(dup, *buf, len); *(dup + len) = '\0'; *buf += len; @@ -3210,8 +3668,10 @@ static int rbd_add_parse_args(const char *buf, /* The first four tokens are required */ len = next_token(&buf); - if (!len) - return -EINVAL; /* Missing monitor address(es) */ + if (!len) { + rbd_warn(NULL, "no monitor address(es) provided"); + return -EINVAL; + } mon_addrs = buf; mon_addrs_size = len + 1; buf += len; @@ -3220,8 +3680,10 @@ static int rbd_add_parse_args(const char *buf, options = dup_token(&buf, NULL); if (!options) return -ENOMEM; - if (!*options) - goto out_err; /* Missing options */ + if (!*options) { + rbd_warn(NULL, "no options provided"); + goto out_err; + } spec = rbd_spec_alloc(); if (!spec) @@ -3230,14 +3692,18 @@ static int rbd_add_parse_args(const char *buf, spec->pool_name = dup_token(&buf, NULL); if (!spec->pool_name) goto out_mem; - if (!*spec->pool_name) - goto out_err; /* Missing pool name */ + if (!*spec->pool_name) { + rbd_warn(NULL, "no pool name provided"); + goto out_err; + } - spec->image_name = dup_token(&buf, &spec->image_name_len); + spec->image_name = dup_token(&buf, NULL); if (!spec->image_name) goto out_mem; - if (!*spec->image_name) - goto out_err; /* Missing image name */ + if (!*spec->image_name) { + rbd_warn(NULL, "no image name provided"); + goto out_err; + } /* * Snapshot name is optional; default is to use "-" @@ -3251,10 +3717,9 @@ static int rbd_add_parse_args(const char *buf, ret = -ENAMETOOLONG; goto out_err; } - spec->snap_name = kmalloc(len + 1, GFP_KERNEL); + spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); if (!spec->snap_name) goto out_mem; - memcpy(spec->snap_name, buf, len); *(spec->snap_name + len) = '\0'; /* Initialize all rbd options to the defaults */ @@ -3323,7 +3788,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) * First, see if the format 2 image id file exists, and if * so, get the image's persistent id from it. */ - size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len; + size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); object_name = kmalloc(size, GFP_NOIO); if (!object_name) return -ENOMEM; @@ -3339,21 +3804,18 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) goto out; } - ret = rbd_req_sync_exec(rbd_dev, object_name, + ret = rbd_obj_method_sync(rbd_dev, object_name, "rbd", "get_id", NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX, - CEPH_OSD_FLAG_READ, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + response, RBD_IMAGE_ID_LEN_MAX, NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; - ret = 0; /* rbd_req_sync_exec() can return positive */ p = response; rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, p + RBD_IMAGE_ID_LEN_MAX, - &rbd_dev->spec->image_id_len, - GFP_NOIO); + NULL, GFP_NOIO); if (IS_ERR(rbd_dev->spec->image_id)) { ret = PTR_ERR(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; @@ -3377,11 +3839,10 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); if (!rbd_dev->spec->image_id) return -ENOMEM; - rbd_dev->spec->image_id_len = 0; /* Record the header object name for this rbd image. */ - size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX); + size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); rbd_dev->header_name = kmalloc(size, GFP_KERNEL); if (!rbd_dev->header_name) { ret = -ENOMEM; @@ -3427,7 +3888,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) * Image id was filled in by the caller. Record the header * object name for this rbd image. */ - size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len; + size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); rbd_dev->header_name = kmalloc(size, GFP_KERNEL); if (!rbd_dev->header_name) return -ENOMEM; @@ -3542,7 +4003,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) goto err_out_bus; - ret = rbd_init_watch_dev(rbd_dev); + ret = rbd_dev_header_watch_sync(rbd_dev, 1); if (ret) goto err_out_bus; @@ -3638,6 +4099,13 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_client; spec->pool_id = (u64) rc; + /* The ceph file layout needs to fit pool id in 32 bits */ + + if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { + rc = -EIO; + goto err_out_client; + } + rbd_dev = rbd_dev_create(rbdc, spec); if (!rbd_dev) goto err_out_client; @@ -3691,15 +4159,8 @@ static void rbd_dev_release(struct device *dev) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - if (rbd_dev->watch_request) { - struct ceph_client *client = rbd_dev->rbd_client->client; - - ceph_osdc_unregister_linger_request(&client->osdc, - rbd_dev->watch_request); - } if (rbd_dev->watch_event) - rbd_req_sync_unwatch(rbd_dev); - + rbd_dev_header_watch_sync(rbd_dev, 0); /* clean up and free blkdev */ rbd_free_disk(rbd_dev); @@ -3743,10 +4204,14 @@ static ssize_t rbd_remove(struct bus_type *bus, goto done; } - if (rbd_dev->open_count) { + spin_lock_irq(&rbd_dev->lock); + if (rbd_dev->open_count) ret = -EBUSY; + else + set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); + spin_unlock_irq(&rbd_dev->lock); + if (ret < 0) goto done; - } rbd_remove_all_snaps(rbd_dev); rbd_bus_del_dev(rbd_dev); @@ -3782,10 +4247,15 @@ static void rbd_sysfs_cleanup(void) device_unregister(&rbd_root_dev); } -int __init rbd_init(void) +static int __init rbd_init(void) { int rc; + if (!libceph_compatible(NULL)) { + rbd_warn(NULL, "libceph incompatibility (quitting)"); + + return -EINVAL; + } rc = rbd_sysfs_init(); if (rc) return rc; @@ -3793,7 +4263,7 @@ int __init rbd_init(void) return 0; } -void __exit rbd_exit(void) +static void __exit rbd_exit(void) { rbd_sysfs_cleanup(); } diff --git a/drivers/block/rsxx/Makefile b/drivers/block/rsxx/Makefile new file mode 100644 index 0000000..f35cd0b --- /dev/null +++ b/drivers/block/rsxx/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_BLK_DEV_RSXX) += rsxx.o +rsxx-y := config.o core.o cregs.o dev.o dma.o diff --git a/drivers/block/rsxx/config.c b/drivers/block/rsxx/config.c new file mode 100644 index 0000000..a295e7e --- /dev/null +++ b/drivers/block/rsxx/config.c @@ -0,0 +1,213 @@ +/* +* Filename: config.c +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <linux/types.h> +#include <linux/crc32.h> +#include <linux/swab.h> + +#include "rsxx_priv.h" +#include "rsxx_cfg.h" + +static void initialize_config(void *config) +{ + struct rsxx_card_cfg *cfg = config; + + cfg->hdr.version = RSXX_CFG_VERSION; + + cfg->data.block_size = RSXX_HW_BLK_SIZE; + cfg->data.stripe_size = RSXX_HW_BLK_SIZE; + cfg->data.vendor_id = RSXX_VENDOR_ID_TMS_IBM; + cfg->data.cache_order = (-1); + cfg->data.intr_coal.mode = RSXX_INTR_COAL_DISABLED; + cfg->data.intr_coal.count = 0; + cfg->data.intr_coal.latency = 0; +} + +static u32 config_data_crc32(struct rsxx_card_cfg *cfg) +{ + /* + * Return the compliment of the CRC to ensure compatibility + * (i.e. this is how early rsxx drivers did it.) + */ + + return ~crc32(~0, &cfg->data, sizeof(cfg->data)); +} + + +/*----------------- Config Byte Swap Functions -------------------*/ +static void config_hdr_be_to_cpu(struct card_cfg_hdr *hdr) +{ + hdr->version = be32_to_cpu((__force __be32) hdr->version); + hdr->crc = be32_to_cpu((__force __be32) hdr->crc); +} + +static void config_hdr_cpu_to_be(struct card_cfg_hdr *hdr) +{ + hdr->version = (__force u32) cpu_to_be32(hdr->version); + hdr->crc = (__force u32) cpu_to_be32(hdr->crc); +} + +static void config_data_swab(struct rsxx_card_cfg *cfg) +{ + u32 *data = (u32 *) &cfg->data; + int i; + + for (i = 0; i < (sizeof(cfg->data) / 4); i++) + data[i] = swab32(data[i]); +} + +static void config_data_le_to_cpu(struct rsxx_card_cfg *cfg) +{ + u32 *data = (u32 *) &cfg->data; + int i; + + for (i = 0; i < (sizeof(cfg->data) / 4); i++) + data[i] = le32_to_cpu((__force __le32) data[i]); +} + +static void config_data_cpu_to_le(struct rsxx_card_cfg *cfg) +{ + u32 *data = (u32 *) &cfg->data; + int i; + + for (i = 0; i < (sizeof(cfg->data) / 4); i++) + data[i] = (__force u32) cpu_to_le32(data[i]); +} + + +/*----------------- Config Operations ------------------*/ +static int rsxx_save_config(struct rsxx_cardinfo *card) +{ + struct rsxx_card_cfg cfg; + int st; + + memcpy(&cfg, &card->config, sizeof(cfg)); + + if (unlikely(cfg.hdr.version != RSXX_CFG_VERSION)) { + dev_err(CARD_TO_DEV(card), + "Cannot save config with invalid version %d\n", + cfg.hdr.version); + return -EINVAL; + } + + /* Convert data to little endian for the CRC calculation. */ + config_data_cpu_to_le(&cfg); + + cfg.hdr.crc = config_data_crc32(&cfg); + + /* + * Swap the data from little endian to big endian so it can be + * stored. + */ + config_data_swab(&cfg); + config_hdr_cpu_to_be(&cfg.hdr); + + st = rsxx_creg_write(card, CREG_ADD_CONFIG, sizeof(cfg), &cfg, 1); + if (st) + return st; + + return 0; +} + +int rsxx_load_config(struct rsxx_cardinfo *card) +{ + int st; + u32 crc; + + st = rsxx_creg_read(card, CREG_ADD_CONFIG, sizeof(card->config), + &card->config, 1); + if (st) { + dev_err(CARD_TO_DEV(card), + "Failed reading card config.\n"); + return st; + } + + config_hdr_be_to_cpu(&card->config.hdr); + + if (card->config.hdr.version == RSXX_CFG_VERSION) { + /* + * We calculate the CRC with the data in little endian, because + * early drivers did not take big endian CPUs into account. + * The data is always stored in big endian, so we need to byte + * swap it before calculating the CRC. + */ + + config_data_swab(&card->config); + + /* Check the CRC */ + crc = config_data_crc32(&card->config); + if (crc != card->config.hdr.crc) { + dev_err(CARD_TO_DEV(card), + "Config corruption detected!\n"); + dev_info(CARD_TO_DEV(card), + "CRC (sb x%08x is x%08x)\n", + card->config.hdr.crc, crc); + return -EIO; + } + + /* Convert the data to CPU byteorder */ + config_data_le_to_cpu(&card->config); + + } else if (card->config.hdr.version != 0) { + dev_err(CARD_TO_DEV(card), + "Invalid config version %d.\n", + card->config.hdr.version); + /* + * Config version changes require special handling from the + * user + */ + return -EINVAL; + } else { + dev_info(CARD_TO_DEV(card), + "Initializing card configuration.\n"); + initialize_config(card); + st = rsxx_save_config(card); + if (st) + return st; + } + + card->config_valid = 1; + + dev_dbg(CARD_TO_DEV(card), "version: x%08x\n", + card->config.hdr.version); + dev_dbg(CARD_TO_DEV(card), "crc: x%08x\n", + card->config.hdr.crc); + dev_dbg(CARD_TO_DEV(card), "block_size: x%08x\n", + card->config.data.block_size); + dev_dbg(CARD_TO_DEV(card), "stripe_size: x%08x\n", + card->config.data.stripe_size); + dev_dbg(CARD_TO_DEV(card), "vendor_id: x%08x\n", + card->config.data.vendor_id); + dev_dbg(CARD_TO_DEV(card), "cache_order: x%08x\n", + card->config.data.cache_order); + dev_dbg(CARD_TO_DEV(card), "mode: x%08x\n", + card->config.data.intr_coal.mode); + dev_dbg(CARD_TO_DEV(card), "count: x%08x\n", + card->config.data.intr_coal.count); + dev_dbg(CARD_TO_DEV(card), "latency: x%08x\n", + card->config.data.intr_coal.latency); + + return 0; +} + diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c new file mode 100644 index 0000000..e516248 --- /dev/null +++ b/drivers/block/rsxx/core.c @@ -0,0 +1,649 @@ +/* +* Filename: core.c +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/reboot.h> +#include <linux/slab.h> +#include <linux/bitops.h> + +#include <linux/genhd.h> +#include <linux/idr.h> + +#include "rsxx_priv.h" +#include "rsxx_cfg.h" + +#define NO_LEGACY 0 + +MODULE_DESCRIPTION("IBM RamSan PCIe Flash SSD Device Driver"); +MODULE_AUTHOR("IBM <support@ramsan.com>"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRIVER_VERSION); + +static unsigned int force_legacy = NO_LEGACY; +module_param(force_legacy, uint, 0444); +MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts"); + +static DEFINE_IDA(rsxx_disk_ida); +static DEFINE_SPINLOCK(rsxx_ida_lock); + +/*----------------- Interrupt Control & Handling -------------------*/ +static void __enable_intr(unsigned int *mask, unsigned int intr) +{ + *mask |= intr; +} + +static void __disable_intr(unsigned int *mask, unsigned int intr) +{ + *mask &= ~intr; +} + +/* + * NOTE: Disabling the IER will disable the hardware interrupt. + * Disabling the ISR will disable the software handling of the ISR bit. + * + * Enable/Disable interrupt functions assume the card->irq_lock + * is held by the caller. + */ +void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr) +{ + if (unlikely(card->halt)) + return; + + __enable_intr(&card->ier_mask, intr); + iowrite32(card->ier_mask, card->regmap + IER); +} + +void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr) +{ + __disable_intr(&card->ier_mask, intr); + iowrite32(card->ier_mask, card->regmap + IER); +} + +void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card, + unsigned int intr) +{ + if (unlikely(card->halt)) + return; + + __enable_intr(&card->isr_mask, intr); + __enable_intr(&card->ier_mask, intr); + iowrite32(card->ier_mask, card->regmap + IER); +} +void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card, + unsigned int intr) +{ + __disable_intr(&card->isr_mask, intr); + __disable_intr(&card->ier_mask, intr); + iowrite32(card->ier_mask, card->regmap + IER); +} + +static irqreturn_t rsxx_isr(int irq, void *pdata) +{ + struct rsxx_cardinfo *card = pdata; + unsigned int isr; + int handled = 0; + int reread_isr; + int i; + + spin_lock(&card->irq_lock); + + do { + reread_isr = 0; + + isr = ioread32(card->regmap + ISR); + if (isr == 0xffffffff) { + /* + * A few systems seem to have an intermittent issue + * where PCI reads return all Fs, but retrying the read + * a little later will return as expected. + */ + dev_info(CARD_TO_DEV(card), + "ISR = 0xFFFFFFFF, retrying later\n"); + break; + } + + isr &= card->isr_mask; + if (!isr) + break; + + for (i = 0; i < card->n_targets; i++) { + if (isr & CR_INTR_DMA(i)) { + if (card->ier_mask & CR_INTR_DMA(i)) { + rsxx_disable_ier(card, CR_INTR_DMA(i)); + reread_isr = 1; + } + queue_work(card->ctrl[i].done_wq, + &card->ctrl[i].dma_done_work); + handled++; + } + } + + if (isr & CR_INTR_CREG) { + schedule_work(&card->creg_ctrl.done_work); + handled++; + } + + if (isr & CR_INTR_EVENT) { + schedule_work(&card->event_work); + rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); + handled++; + } + } while (reread_isr); + + spin_unlock(&card->irq_lock); + + return handled ? IRQ_HANDLED : IRQ_NONE; +} + +/*----------------- Card Event Handler -------------------*/ +static char *rsxx_card_state_to_str(unsigned int state) +{ + static char *state_strings[] = { + "Unknown", "Shutdown", "Starting", "Formatting", + "Uninitialized", "Good", "Shutting Down", + "Fault", "Read Only Fault", "dStroying" + }; + + return state_strings[ffs(state)]; +} + +static void card_state_change(struct rsxx_cardinfo *card, + unsigned int new_state) +{ + int st; + + dev_info(CARD_TO_DEV(card), + "card state change detected.(%s -> %s)\n", + rsxx_card_state_to_str(card->state), + rsxx_card_state_to_str(new_state)); + + card->state = new_state; + + /* Don't attach DMA interfaces if the card has an invalid config */ + if (!card->config_valid) + return; + + switch (new_state) { + case CARD_STATE_RD_ONLY_FAULT: + dev_crit(CARD_TO_DEV(card), + "Hardware has entered read-only mode!\n"); + /* + * Fall through so the DMA devices can be attached and + * the user can attempt to pull off their data. + */ + case CARD_STATE_GOOD: + st = rsxx_get_card_size8(card, &card->size8); + if (st) + dev_err(CARD_TO_DEV(card), + "Failed attaching DMA devices\n"); + + if (card->config_valid) + set_capacity(card->gendisk, card->size8 >> 9); + break; + + case CARD_STATE_FAULT: + dev_crit(CARD_TO_DEV(card), + "Hardware Fault reported!\n"); + /* Fall through. */ + + /* Everything else, detach DMA interface if it's attached. */ + case CARD_STATE_SHUTDOWN: + case CARD_STATE_STARTING: + case CARD_STATE_FORMATTING: + case CARD_STATE_UNINITIALIZED: + case CARD_STATE_SHUTTING_DOWN: + /* + * dStroy is a term coined by marketing to represent the low level + * secure erase. + */ + case CARD_STATE_DSTROYING: + set_capacity(card->gendisk, 0); + break; + } +} + +static void card_event_handler(struct work_struct *work) +{ + struct rsxx_cardinfo *card; + unsigned int state; + unsigned long flags; + int st; + + card = container_of(work, struct rsxx_cardinfo, event_work); + + if (unlikely(card->halt)) + return; + + /* + * Enable the interrupt now to avoid any weird race conditions where a + * state change might occur while rsxx_get_card_state() is + * processing a returned creg cmd. + */ + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_enable_ier_and_isr(card, CR_INTR_EVENT); + spin_unlock_irqrestore(&card->irq_lock, flags); + + st = rsxx_get_card_state(card, &state); + if (st) { + dev_info(CARD_TO_DEV(card), + "Failed reading state after event.\n"); + return; + } + + if (card->state != state) + card_state_change(card, state); + + if (card->creg_ctrl.creg_stats.stat & CREG_STAT_LOG_PENDING) + rsxx_read_hw_log(card); +} + +/*----------------- Card Operations -------------------*/ +static int card_shutdown(struct rsxx_cardinfo *card) +{ + unsigned int state; + signed long start; + const int timeout = msecs_to_jiffies(120000); + int st; + + /* We can't issue a shutdown if the card is in a transition state */ + start = jiffies; + do { + st = rsxx_get_card_state(card, &state); + if (st) + return st; + } while (state == CARD_STATE_STARTING && + (jiffies - start < timeout)); + + if (state == CARD_STATE_STARTING) + return -ETIMEDOUT; + + /* Only issue a shutdown if we need to */ + if ((state != CARD_STATE_SHUTTING_DOWN) && + (state != CARD_STATE_SHUTDOWN)) { + st = rsxx_issue_card_cmd(card, CARD_CMD_SHUTDOWN); + if (st) + return st; + } + + start = jiffies; + do { + st = rsxx_get_card_state(card, &state); + if (st) + return st; + } while (state != CARD_STATE_SHUTDOWN && + (jiffies - start < timeout)); + + if (state != CARD_STATE_SHUTDOWN) + return -ETIMEDOUT; + + return 0; +} + +/*----------------- Driver Initialization & Setup -------------------*/ +/* Returns: 0 if the driver is compatible with the device + -1 if the driver is NOT compatible with the device */ +static int rsxx_compatibility_check(struct rsxx_cardinfo *card) +{ + unsigned char pci_rev; + + pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev); + + if (pci_rev > RS70_PCI_REV_SUPPORTED) + return -1; + return 0; +} + +static int rsxx_pci_probe(struct pci_dev *dev, + const struct pci_device_id *id) +{ + struct rsxx_cardinfo *card; + int st; + + dev_info(&dev->dev, "PCI-Flash SSD discovered\n"); + + card = kzalloc(sizeof(*card), GFP_KERNEL); + if (!card) + return -ENOMEM; + + card->dev = dev; + pci_set_drvdata(dev, card); + + do { + if (!ida_pre_get(&rsxx_disk_ida, GFP_KERNEL)) { + st = -ENOMEM; + goto failed_ida_get; + } + + spin_lock(&rsxx_ida_lock); + st = ida_get_new(&rsxx_disk_ida, &card->disk_id); + spin_unlock(&rsxx_ida_lock); + } while (st == -EAGAIN); + + if (st) + goto failed_ida_get; + + st = pci_enable_device(dev); + if (st) + goto failed_enable; + + pci_set_master(dev); + pci_set_dma_max_seg_size(dev, RSXX_HW_BLK_SIZE); + + st = pci_set_dma_mask(dev, DMA_BIT_MASK(64)); + if (st) { + dev_err(CARD_TO_DEV(card), + "No usable DMA configuration,aborting\n"); + goto failed_dma_mask; + } + + st = pci_request_regions(dev, DRIVER_NAME); + if (st) { + dev_err(CARD_TO_DEV(card), + "Failed to request memory region\n"); + goto failed_request_regions; + } + + if (pci_resource_len(dev, 0) == 0) { + dev_err(CARD_TO_DEV(card), "BAR0 has length 0!\n"); + st = -ENOMEM; + goto failed_iomap; + } + + card->regmap = pci_iomap(dev, 0, 0); + if (!card->regmap) { + dev_err(CARD_TO_DEV(card), "Failed to map BAR0\n"); + st = -ENOMEM; + goto failed_iomap; + } + + spin_lock_init(&card->irq_lock); + card->halt = 0; + + spin_lock_irq(&card->irq_lock); + rsxx_disable_ier_and_isr(card, CR_INTR_ALL); + spin_unlock_irq(&card->irq_lock); + + if (!force_legacy) { + st = pci_enable_msi(dev); + if (st) + dev_warn(CARD_TO_DEV(card), + "Failed to enable MSI\n"); + } + + st = request_irq(dev->irq, rsxx_isr, IRQF_DISABLED | IRQF_SHARED, + DRIVER_NAME, card); + if (st) { + dev_err(CARD_TO_DEV(card), + "Failed requesting IRQ%d\n", dev->irq); + goto failed_irq; + } + + /************* Setup Processor Command Interface *************/ + rsxx_creg_setup(card); + + spin_lock_irq(&card->irq_lock); + rsxx_enable_ier_and_isr(card, CR_INTR_CREG); + spin_unlock_irq(&card->irq_lock); + + st = rsxx_compatibility_check(card); + if (st) { + dev_warn(CARD_TO_DEV(card), + "Incompatible driver detected. Please update the driver.\n"); + st = -EINVAL; + goto failed_compatiblity_check; + } + + /************* Load Card Config *************/ + st = rsxx_load_config(card); + if (st) + dev_err(CARD_TO_DEV(card), + "Failed loading card config\n"); + + /************* Setup DMA Engine *************/ + st = rsxx_get_num_targets(card, &card->n_targets); + if (st) + dev_info(CARD_TO_DEV(card), + "Failed reading the number of DMA targets\n"); + + card->ctrl = kzalloc(card->n_targets * sizeof(*card->ctrl), GFP_KERNEL); + if (!card->ctrl) { + st = -ENOMEM; + goto failed_dma_setup; + } + + st = rsxx_dma_setup(card); + if (st) { + dev_info(CARD_TO_DEV(card), + "Failed to setup DMA engine\n"); + goto failed_dma_setup; + } + + /************* Setup Card Event Handler *************/ + INIT_WORK(&card->event_work, card_event_handler); + + st = rsxx_setup_dev(card); + if (st) + goto failed_create_dev; + + rsxx_get_card_state(card, &card->state); + + dev_info(CARD_TO_DEV(card), + "card state: %s\n", + rsxx_card_state_to_str(card->state)); + + /* + * Now that the DMA Engine and devices have been setup, + * we can enable the event interrupt(it kicks off actions in + * those layers so we couldn't enable it right away.) + */ + spin_lock_irq(&card->irq_lock); + rsxx_enable_ier_and_isr(card, CR_INTR_EVENT); + spin_unlock_irq(&card->irq_lock); + + if (card->state == CARD_STATE_SHUTDOWN) { + st = rsxx_issue_card_cmd(card, CARD_CMD_STARTUP); + if (st) + dev_crit(CARD_TO_DEV(card), + "Failed issuing card startup\n"); + } else if (card->state == CARD_STATE_GOOD || + card->state == CARD_STATE_RD_ONLY_FAULT) { + st = rsxx_get_card_size8(card, &card->size8); + if (st) + card->size8 = 0; + } + + rsxx_attach_dev(card); + + return 0; + +failed_create_dev: + rsxx_dma_destroy(card); +failed_dma_setup: +failed_compatiblity_check: + spin_lock_irq(&card->irq_lock); + rsxx_disable_ier_and_isr(card, CR_INTR_ALL); + spin_unlock_irq(&card->irq_lock); + free_irq(dev->irq, card); + if (!force_legacy) + pci_disable_msi(dev); +failed_irq: + pci_iounmap(dev, card->regmap); +failed_iomap: + pci_release_regions(dev); +failed_request_regions: +failed_dma_mask: + pci_disable_device(dev); +failed_enable: + spin_lock(&rsxx_ida_lock); + ida_remove(&rsxx_disk_ida, card->disk_id); + spin_unlock(&rsxx_ida_lock); +failed_ida_get: + kfree(card); + + return st; +} + +static void rsxx_pci_remove(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + unsigned long flags; + int st; + int i; + + if (!card) + return; + + dev_info(CARD_TO_DEV(card), + "Removing PCI-Flash SSD.\n"); + + rsxx_detach_dev(card); + + for (i = 0; i < card->n_targets; i++) { + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i)); + spin_unlock_irqrestore(&card->irq_lock, flags); + } + + st = card_shutdown(card); + if (st) + dev_crit(CARD_TO_DEV(card), "Shutdown failed!\n"); + + /* Sync outstanding event handlers. */ + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); + spin_unlock_irqrestore(&card->irq_lock, flags); + + /* Prevent work_structs from re-queuing themselves. */ + card->halt = 1; + + cancel_work_sync(&card->event_work); + + rsxx_destroy_dev(card); + rsxx_dma_destroy(card); + + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_ALL); + spin_unlock_irqrestore(&card->irq_lock, flags); + free_irq(dev->irq, card); + + if (!force_legacy) + pci_disable_msi(dev); + + rsxx_creg_destroy(card); + + pci_iounmap(dev, card->regmap); + + pci_disable_device(dev); + pci_release_regions(dev); + + kfree(card); +} + +static int rsxx_pci_suspend(struct pci_dev *dev, pm_message_t state) +{ + /* We don't support suspend at this time. */ + return -ENOSYS; +} + +static void rsxx_pci_shutdown(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + unsigned long flags; + int i; + + if (!card) + return; + + dev_info(CARD_TO_DEV(card), "Shutting down PCI-Flash SSD.\n"); + + rsxx_detach_dev(card); + + for (i = 0; i < card->n_targets; i++) { + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i)); + spin_unlock_irqrestore(&card->irq_lock, flags); + } + + card_shutdown(card); +} + +static DEFINE_PCI_DEVICE_TABLE(rsxx_pci_ids) = { + {PCI_DEVICE(PCI_VENDOR_ID_TMS_IBM, PCI_DEVICE_ID_RS70_FLASH)}, + {PCI_DEVICE(PCI_VENDOR_ID_TMS_IBM, PCI_DEVICE_ID_RS70D_FLASH)}, + {PCI_DEVICE(PCI_VENDOR_ID_TMS_IBM, PCI_DEVICE_ID_RS80_FLASH)}, + {PCI_DEVICE(PCI_VENDOR_ID_TMS_IBM, PCI_DEVICE_ID_RS81_FLASH)}, + {0,}, +}; + +MODULE_DEVICE_TABLE(pci, rsxx_pci_ids); + +static struct pci_driver rsxx_pci_driver = { + .name = DRIVER_NAME, + .id_table = rsxx_pci_ids, + .probe = rsxx_pci_probe, + .remove = rsxx_pci_remove, + .suspend = rsxx_pci_suspend, + .shutdown = rsxx_pci_shutdown, +}; + +static int __init rsxx_core_init(void) +{ + int st; + + st = rsxx_dev_init(); + if (st) + return st; + + st = rsxx_dma_init(); + if (st) + goto dma_init_failed; + + st = rsxx_creg_init(); + if (st) + goto creg_init_failed; + + return pci_register_driver(&rsxx_pci_driver); + +creg_init_failed: + rsxx_dma_cleanup(); +dma_init_failed: + rsxx_dev_cleanup(); + + return st; +} + +static void __exit rsxx_core_cleanup(void) +{ + pci_unregister_driver(&rsxx_pci_driver); + rsxx_creg_cleanup(); + rsxx_dma_cleanup(); + rsxx_dev_cleanup(); +} + +module_init(rsxx_core_init); +module_exit(rsxx_core_cleanup); diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c new file mode 100644 index 0000000..80bbe63 --- /dev/null +++ b/drivers/block/rsxx/cregs.c @@ -0,0 +1,758 @@ +/* +* Filename: cregs.c +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <linux/completion.h> +#include <linux/slab.h> + +#include "rsxx_priv.h" + +#define CREG_TIMEOUT_MSEC 10000 + +typedef void (*creg_cmd_cb)(struct rsxx_cardinfo *card, + struct creg_cmd *cmd, + int st); + +struct creg_cmd { + struct list_head list; + creg_cmd_cb cb; + void *cb_private; + unsigned int op; + unsigned int addr; + int cnt8; + void *buf; + unsigned int stream; + unsigned int status; +}; + +static struct kmem_cache *creg_cmd_pool; + + +/*------------ Private Functions --------------*/ + +#if defined(__LITTLE_ENDIAN) +#define LITTLE_ENDIAN 1 +#elif defined(__BIG_ENDIAN) +#define LITTLE_ENDIAN 0 +#else +#error Unknown endianess!!! Aborting... +#endif + +static void copy_to_creg_data(struct rsxx_cardinfo *card, + int cnt8, + void *buf, + unsigned int stream) +{ + int i = 0; + u32 *data = buf; + + for (i = 0; cnt8 > 0; i++, cnt8 -= 4) { + /* + * Firmware implementation makes it necessary to byte swap on + * little endian processors. + */ + if (LITTLE_ENDIAN && stream) + iowrite32be(data[i], card->regmap + CREG_DATA(i)); + else + iowrite32(data[i], card->regmap + CREG_DATA(i)); + } +} + + +static void copy_from_creg_data(struct rsxx_cardinfo *card, + int cnt8, + void *buf, + unsigned int stream) +{ + int i = 0; + u32 *data = buf; + + for (i = 0; cnt8 > 0; i++, cnt8 -= 4) { + /* + * Firmware implementation makes it necessary to byte swap on + * little endian processors. + */ + if (LITTLE_ENDIAN && stream) + data[i] = ioread32be(card->regmap + CREG_DATA(i)); + else + data[i] = ioread32(card->regmap + CREG_DATA(i)); + } +} + +static struct creg_cmd *pop_active_cmd(struct rsxx_cardinfo *card) +{ + struct creg_cmd *cmd; + + /* + * Spin lock is needed because this can be called in atomic/interrupt + * context. + */ + spin_lock_bh(&card->creg_ctrl.lock); + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + spin_unlock_bh(&card->creg_ctrl.lock); + + return cmd; +} + +static void creg_issue_cmd(struct rsxx_cardinfo *card, struct creg_cmd *cmd) +{ + iowrite32(cmd->addr, card->regmap + CREG_ADD); + iowrite32(cmd->cnt8, card->regmap + CREG_CNT); + + if (cmd->op == CREG_OP_WRITE) { + if (cmd->buf) + copy_to_creg_data(card, cmd->cnt8, + cmd->buf, cmd->stream); + } + + /* + * Data copy must complete before initiating the command. This is + * needed for weakly ordered processors (i.e. PowerPC), so that all + * neccessary registers are written before we kick the hardware. + */ + wmb(); + + /* Setting the valid bit will kick off the command. */ + iowrite32(cmd->op, card->regmap + CREG_CMD); +} + +static void creg_kick_queue(struct rsxx_cardinfo *card) +{ + if (card->creg_ctrl.active || list_empty(&card->creg_ctrl.queue)) + return; + + card->creg_ctrl.active = 1; + card->creg_ctrl.active_cmd = list_first_entry(&card->creg_ctrl.queue, + struct creg_cmd, list); + list_del(&card->creg_ctrl.active_cmd->list); + card->creg_ctrl.q_depth--; + + /* + * We have to set the timer before we push the new command. Otherwise, + * we could create a race condition that would occur if the timer + * was not canceled, and expired after the new command was pushed, + * but before the command was issued to hardware. + */ + mod_timer(&card->creg_ctrl.cmd_timer, + jiffies + msecs_to_jiffies(CREG_TIMEOUT_MSEC)); + + creg_issue_cmd(card, card->creg_ctrl.active_cmd); +} + +static int creg_queue_cmd(struct rsxx_cardinfo *card, + unsigned int op, + unsigned int addr, + unsigned int cnt8, + void *buf, + int stream, + creg_cmd_cb callback, + void *cb_private) +{ + struct creg_cmd *cmd; + + /* Don't queue stuff up if we're halted. */ + if (unlikely(card->halt)) + return -EINVAL; + + if (card->creg_ctrl.reset) + return -EAGAIN; + + if (cnt8 > MAX_CREG_DATA8) + return -EINVAL; + + cmd = kmem_cache_alloc(creg_cmd_pool, GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + INIT_LIST_HEAD(&cmd->list); + + cmd->op = op; + cmd->addr = addr; + cmd->cnt8 = cnt8; + cmd->buf = buf; + cmd->stream = stream; + cmd->cb = callback; + cmd->cb_private = cb_private; + cmd->status = 0; + + spin_lock(&card->creg_ctrl.lock); + list_add_tail(&cmd->list, &card->creg_ctrl.queue); + card->creg_ctrl.q_depth++; + creg_kick_queue(card); + spin_unlock(&card->creg_ctrl.lock); + + return 0; +} + +static void creg_cmd_timed_out(unsigned long data) +{ + struct rsxx_cardinfo *card = (struct rsxx_cardinfo *) data; + struct creg_cmd *cmd; + + cmd = pop_active_cmd(card); + if (cmd == NULL) { + card->creg_ctrl.creg_stats.creg_timeout++; + dev_warn(CARD_TO_DEV(card), + "No active command associated with timeout!\n"); + return; + } + + if (cmd->cb) + cmd->cb(card, cmd, -ETIMEDOUT); + + kmem_cache_free(creg_cmd_pool, cmd); + + + spin_lock(&card->creg_ctrl.lock); + card->creg_ctrl.active = 0; + creg_kick_queue(card); + spin_unlock(&card->creg_ctrl.lock); +} + + +static void creg_cmd_done(struct work_struct *work) +{ + struct rsxx_cardinfo *card; + struct creg_cmd *cmd; + int st = 0; + + card = container_of(work, struct rsxx_cardinfo, + creg_ctrl.done_work); + + /* + * The timer could not be cancelled for some reason, + * race to pop the active command. + */ + if (del_timer_sync(&card->creg_ctrl.cmd_timer) == 0) + card->creg_ctrl.creg_stats.failed_cancel_timer++; + + cmd = pop_active_cmd(card); + if (cmd == NULL) { + dev_err(CARD_TO_DEV(card), + "Spurious creg interrupt!\n"); + return; + } + + card->creg_ctrl.creg_stats.stat = ioread32(card->regmap + CREG_STAT); + cmd->status = card->creg_ctrl.creg_stats.stat; + if ((cmd->status & CREG_STAT_STATUS_MASK) == 0) { + dev_err(CARD_TO_DEV(card), + "Invalid status on creg command\n"); + /* + * At this point we're probably reading garbage from HW. Don't + * do anything else that could mess up the system and let + * the sync function return an error. + */ + st = -EIO; + goto creg_done; + } else if (cmd->status & CREG_STAT_ERROR) { + st = -EIO; + } + + if ((cmd->op == CREG_OP_READ)) { + unsigned int cnt8 = ioread32(card->regmap + CREG_CNT); + + /* Paranoid Sanity Checks */ + if (!cmd->buf) { + dev_err(CARD_TO_DEV(card), + "Buffer not given for read.\n"); + st = -EIO; + goto creg_done; + } + if (cnt8 != cmd->cnt8) { + dev_err(CARD_TO_DEV(card), + "count mismatch\n"); + st = -EIO; + goto creg_done; + } + + copy_from_creg_data(card, cnt8, cmd->buf, cmd->stream); + } + +creg_done: + if (cmd->cb) + cmd->cb(card, cmd, st); + + kmem_cache_free(creg_cmd_pool, cmd); + + spin_lock(&card->creg_ctrl.lock); + card->creg_ctrl.active = 0; + creg_kick_queue(card); + spin_unlock(&card->creg_ctrl.lock); +} + +static void creg_reset(struct rsxx_cardinfo *card) +{ + struct creg_cmd *cmd = NULL; + struct creg_cmd *tmp; + unsigned long flags; + + /* + * mutex_trylock is used here because if reset_lock is taken then a + * reset is already happening. So, we can just go ahead and return. + */ + if (!mutex_trylock(&card->creg_ctrl.reset_lock)) + return; + + card->creg_ctrl.reset = 1; + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT); + spin_unlock_irqrestore(&card->irq_lock, flags); + + dev_warn(CARD_TO_DEV(card), + "Resetting creg interface for recovery\n"); + + /* Cancel outstanding commands */ + spin_lock(&card->creg_ctrl.lock); + list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) { + list_del(&cmd->list); + card->creg_ctrl.q_depth--; + if (cmd->cb) + cmd->cb(card, cmd, -ECANCELED); + kmem_cache_free(creg_cmd_pool, cmd); + } + + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + if (cmd) { + if (timer_pending(&card->creg_ctrl.cmd_timer)) + del_timer_sync(&card->creg_ctrl.cmd_timer); + + if (cmd->cb) + cmd->cb(card, cmd, -ECANCELED); + kmem_cache_free(creg_cmd_pool, cmd); + + card->creg_ctrl.active = 0; + } + spin_unlock(&card->creg_ctrl.lock); + + card->creg_ctrl.reset = 0; + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_enable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT); + spin_unlock_irqrestore(&card->irq_lock, flags); + + mutex_unlock(&card->creg_ctrl.reset_lock); +} + +/* Used for synchronous accesses */ +struct creg_completion { + struct completion *cmd_done; + int st; + u32 creg_status; +}; + +static void creg_cmd_done_cb(struct rsxx_cardinfo *card, + struct creg_cmd *cmd, + int st) +{ + struct creg_completion *cmd_completion; + + cmd_completion = cmd->cb_private; + BUG_ON(!cmd_completion); + + cmd_completion->st = st; + cmd_completion->creg_status = cmd->status; + complete(cmd_completion->cmd_done); +} + +static int __issue_creg_rw(struct rsxx_cardinfo *card, + unsigned int op, + unsigned int addr, + unsigned int cnt8, + void *buf, + int stream, + unsigned int *hw_stat) +{ + DECLARE_COMPLETION_ONSTACK(cmd_done); + struct creg_completion completion; + unsigned long timeout; + int st; + + completion.cmd_done = &cmd_done; + completion.st = 0; + completion.creg_status = 0; + + st = creg_queue_cmd(card, op, addr, cnt8, buf, stream, creg_cmd_done_cb, + &completion); + if (st) + return st; + + /* + * This timeout is neccessary for unresponsive hardware. The additional + * 20 seconds to used to guarantee that each cregs requests has time to + * complete. + */ + timeout = msecs_to_jiffies((CREG_TIMEOUT_MSEC * + card->creg_ctrl.q_depth) + 20000); + + /* + * The creg interface is guaranteed to complete. It has a timeout + * mechanism that will kick in if hardware does not respond. + */ + st = wait_for_completion_timeout(completion.cmd_done, timeout); + if (st == 0) { + /* + * This is really bad, because the kernel timer did not + * expire and notify us of a timeout! + */ + dev_crit(CARD_TO_DEV(card), + "cregs timer failed\n"); + creg_reset(card); + return -EIO; + } + + *hw_stat = completion.creg_status; + + if (completion.st) { + dev_warn(CARD_TO_DEV(card), + "creg command failed(%d x%08x)\n", + completion.st, addr); + return completion.st; + } + + return 0; +} + +static int issue_creg_rw(struct rsxx_cardinfo *card, + u32 addr, + unsigned int size8, + void *data, + int stream, + int read) +{ + unsigned int hw_stat; + unsigned int xfer; + unsigned int op; + int st; + + op = read ? CREG_OP_READ : CREG_OP_WRITE; + + do { + xfer = min_t(unsigned int, size8, MAX_CREG_DATA8); + + st = __issue_creg_rw(card, op, addr, xfer, + data, stream, &hw_stat); + if (st) + return st; + + data = (char *)data + xfer; + addr += xfer; + size8 -= xfer; + } while (size8); + + return 0; +} + +/* ---------------------------- Public API ---------------------------------- */ +int rsxx_creg_write(struct rsxx_cardinfo *card, + u32 addr, + unsigned int size8, + void *data, + int byte_stream) +{ + return issue_creg_rw(card, addr, size8, data, byte_stream, 0); +} + +int rsxx_creg_read(struct rsxx_cardinfo *card, + u32 addr, + unsigned int size8, + void *data, + int byte_stream) +{ + return issue_creg_rw(card, addr, size8, data, byte_stream, 1); +} + +int rsxx_get_card_state(struct rsxx_cardinfo *card, unsigned int *state) +{ + return rsxx_creg_read(card, CREG_ADD_CARD_STATE, + sizeof(*state), state, 0); +} + +int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8) +{ + unsigned int size; + int st; + + st = rsxx_creg_read(card, CREG_ADD_CARD_SIZE, + sizeof(size), &size, 0); + if (st) + return st; + + *size8 = (u64)size * RSXX_HW_BLK_SIZE; + return 0; +} + +int rsxx_get_num_targets(struct rsxx_cardinfo *card, + unsigned int *n_targets) +{ + return rsxx_creg_read(card, CREG_ADD_NUM_TARGETS, + sizeof(*n_targets), n_targets, 0); +} + +int rsxx_get_card_capabilities(struct rsxx_cardinfo *card, + u32 *capabilities) +{ + return rsxx_creg_read(card, CREG_ADD_CAPABILITIES, + sizeof(*capabilities), capabilities, 0); +} + +int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd) +{ + return rsxx_creg_write(card, CREG_ADD_CARD_CMD, + sizeof(cmd), &cmd, 0); +} + + +/*----------------- HW Log Functions -------------------*/ +static void hw_log_msg(struct rsxx_cardinfo *card, const char *str, int len) +{ + static char level; + + /* + * New messages start with "<#>", where # is the log level. Messages + * that extend past the log buffer will use the previous level + */ + if ((len > 3) && (str[0] == '<') && (str[2] == '>')) { + level = str[1]; + str += 3; /* Skip past the log level. */ + len -= 3; + } + + switch (level) { + case '0': + dev_emerg(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '1': + dev_alert(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '2': + dev_crit(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '3': + dev_err(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '4': + dev_warn(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '5': + dev_notice(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '6': + dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '7': + dev_dbg(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + default: + dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + } +} + +/* + * The substrncpy function copies the src string (which includes the + * terminating '\0' character), up to the count into the dest pointer. + * Returns the number of bytes copied to dest. + */ +static int substrncpy(char *dest, const char *src, int count) +{ + int max_cnt = count; + + while (count) { + count--; + *dest = *src; + if (*dest == '\0') + break; + src++; + dest++; + } + return max_cnt - count; +} + + +static void read_hw_log_done(struct rsxx_cardinfo *card, + struct creg_cmd *cmd, + int st) +{ + char *buf; + char *log_str; + int cnt; + int len; + int off; + + buf = cmd->buf; + off = 0; + + /* Failed getting the log message */ + if (st) + return; + + while (off < cmd->cnt8) { + log_str = &card->log.buf[card->log.buf_len]; + cnt = min(cmd->cnt8 - off, LOG_BUF_SIZE8 - card->log.buf_len); + len = substrncpy(log_str, &buf[off], cnt); + + off += len; + card->log.buf_len += len; + + /* + * Flush the log if we've hit the end of a message or if we've + * run out of buffer space. + */ + if ((log_str[len - 1] == '\0') || + (card->log.buf_len == LOG_BUF_SIZE8)) { + if (card->log.buf_len != 1) /* Don't log blank lines. */ + hw_log_msg(card, card->log.buf, + card->log.buf_len); + card->log.buf_len = 0; + } + + } + + if (cmd->status & CREG_STAT_LOG_PENDING) + rsxx_read_hw_log(card); +} + +int rsxx_read_hw_log(struct rsxx_cardinfo *card) +{ + int st; + + st = creg_queue_cmd(card, CREG_OP_READ, CREG_ADD_LOG, + sizeof(card->log.tmp), card->log.tmp, + 1, read_hw_log_done, NULL); + if (st) + dev_err(CARD_TO_DEV(card), + "Failed getting log text\n"); + + return st; +} + +/*-------------- IOCTL REG Access ------------------*/ +static int issue_reg_cmd(struct rsxx_cardinfo *card, + struct rsxx_reg_access *cmd, + int read) +{ + unsigned int op = read ? CREG_OP_READ : CREG_OP_WRITE; + + return __issue_creg_rw(card, op, cmd->addr, cmd->cnt, cmd->data, + cmd->stream, &cmd->stat); +} + +int rsxx_reg_access(struct rsxx_cardinfo *card, + struct rsxx_reg_access __user *ucmd, + int read) +{ + struct rsxx_reg_access cmd; + int st; + + st = copy_from_user(&cmd, ucmd, sizeof(cmd)); + if (st) + return -EFAULT; + + if (cmd.cnt > RSXX_MAX_REG_CNT) + return -EFAULT; + + st = issue_reg_cmd(card, &cmd, read); + if (st) + return st; + + st = put_user(cmd.stat, &ucmd->stat); + if (st) + return -EFAULT; + + if (read) { + st = copy_to_user(ucmd->data, cmd.data, cmd.cnt); + if (st) + return -EFAULT; + } + + return 0; +} + +/*------------ Initialization & Setup --------------*/ +int rsxx_creg_setup(struct rsxx_cardinfo *card) +{ + card->creg_ctrl.active_cmd = NULL; + + INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done); + mutex_init(&card->creg_ctrl.reset_lock); + INIT_LIST_HEAD(&card->creg_ctrl.queue); + spin_lock_init(&card->creg_ctrl.lock); + setup_timer(&card->creg_ctrl.cmd_timer, creg_cmd_timed_out, + (unsigned long) card); + + return 0; +} + +void rsxx_creg_destroy(struct rsxx_cardinfo *card) +{ + struct creg_cmd *cmd; + struct creg_cmd *tmp; + int cnt = 0; + + /* Cancel outstanding commands */ + spin_lock(&card->creg_ctrl.lock); + list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) { + list_del(&cmd->list); + if (cmd->cb) + cmd->cb(card, cmd, -ECANCELED); + kmem_cache_free(creg_cmd_pool, cmd); + cnt++; + } + + if (cnt) + dev_info(CARD_TO_DEV(card), + "Canceled %d queue creg commands\n", cnt); + + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + if (cmd) { + if (timer_pending(&card->creg_ctrl.cmd_timer)) + del_timer_sync(&card->creg_ctrl.cmd_timer); + + if (cmd->cb) + cmd->cb(card, cmd, -ECANCELED); + dev_info(CARD_TO_DEV(card), + "Canceled active creg command\n"); + kmem_cache_free(creg_cmd_pool, cmd); + } + spin_unlock(&card->creg_ctrl.lock); + + cancel_work_sync(&card->creg_ctrl.done_work); +} + + +int rsxx_creg_init(void) +{ + creg_cmd_pool = KMEM_CACHE(creg_cmd, SLAB_HWCACHE_ALIGN); + if (!creg_cmd_pool) + return -ENOMEM; + + return 0; +} + +void rsxx_creg_cleanup(void) +{ + kmem_cache_destroy(creg_cmd_pool); +} diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c new file mode 100644 index 0000000..4346d17 --- /dev/null +++ b/drivers/block/rsxx/dev.c @@ -0,0 +1,367 @@ +/* +* Filename: dev.c +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/slab.h> + +#include <linux/hdreg.h> +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include <linux/bio.h> + +#include <linux/fs.h> + +#include "rsxx_priv.h" + +static unsigned int blkdev_minors = 64; +module_param(blkdev_minors, uint, 0444); +MODULE_PARM_DESC(blkdev_minors, "Number of minors(partitions)"); + +/* + * For now I'm making this tweakable in case any applications hit this limit. + * If you see a "bio too big" error in the log you will need to raise this + * value. + */ +static unsigned int blkdev_max_hw_sectors = 1024; +module_param(blkdev_max_hw_sectors, uint, 0444); +MODULE_PARM_DESC(blkdev_max_hw_sectors, "Max hw sectors for a single BIO"); + +static unsigned int enable_blkdev = 1; +module_param(enable_blkdev , uint, 0444); +MODULE_PARM_DESC(enable_blkdev, "Enable block device interfaces"); + + +struct rsxx_bio_meta { + struct bio *bio; + atomic_t pending_dmas; + atomic_t error; + unsigned long start_time; +}; + +static struct kmem_cache *bio_meta_pool; + +/*----------------- Block Device Operations -----------------*/ +static int rsxx_blkdev_ioctl(struct block_device *bdev, + fmode_t mode, + unsigned int cmd, + unsigned long arg) +{ + struct rsxx_cardinfo *card = bdev->bd_disk->private_data; + + switch (cmd) { + case RSXX_GETREG: + return rsxx_reg_access(card, (void __user *)arg, 1); + case RSXX_SETREG: + return rsxx_reg_access(card, (void __user *)arg, 0); + } + + return -ENOTTY; +} + +static int rsxx_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct rsxx_cardinfo *card = bdev->bd_disk->private_data; + u64 blocks = card->size8 >> 9; + + /* + * get geometry: Fake it. I haven't found any drivers that set + * geo->start, so we won't either. + */ + if (card->size8) { + geo->heads = 64; + geo->sectors = 16; + do_div(blocks, (geo->heads * geo->sectors)); + geo->cylinders = blocks; + } else { + geo->heads = 0; + geo->sectors = 0; + geo->cylinders = 0; + } + return 0; +} + +static const struct block_device_operations rsxx_fops = { + .owner = THIS_MODULE, + .getgeo = rsxx_getgeo, + .ioctl = rsxx_blkdev_ioctl, +}; + +static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) +{ + struct hd_struct *part0 = &card->gendisk->part0; + int rw = bio_data_dir(bio); + int cpu; + + cpu = part_stat_lock(); + + part_round_stats(cpu, part0); + part_inc_in_flight(part0, rw); + + part_stat_unlock(); +} + +static void disk_stats_complete(struct rsxx_cardinfo *card, + struct bio *bio, + unsigned long start_time) +{ + struct hd_struct *part0 = &card->gendisk->part0; + unsigned long duration = jiffies - start_time; + int rw = bio_data_dir(bio); + int cpu; + + cpu = part_stat_lock(); + + part_stat_add(cpu, part0, sectors[rw], bio_sectors(bio)); + part_stat_inc(cpu, part0, ios[rw]); + part_stat_add(cpu, part0, ticks[rw], duration); + + part_round_stats(cpu, part0); + part_dec_in_flight(part0, rw); + + part_stat_unlock(); +} + +static void bio_dma_done_cb(struct rsxx_cardinfo *card, + void *cb_data, + unsigned int error) +{ + struct rsxx_bio_meta *meta = cb_data; + + if (error) + atomic_set(&meta->error, 1); + + if (atomic_dec_and_test(&meta->pending_dmas)) { + disk_stats_complete(card, meta->bio, meta->start_time); + + bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0); + kmem_cache_free(bio_meta_pool, meta); + } +} + +static void rsxx_make_request(struct request_queue *q, struct bio *bio) +{ + struct rsxx_cardinfo *card = q->queuedata; + struct rsxx_bio_meta *bio_meta; + int st = -EINVAL; + + might_sleep(); + + if (unlikely(card->halt)) { + st = -EFAULT; + goto req_err; + } + + if (unlikely(card->dma_fault)) { + st = (-EFAULT); + goto req_err; + } + + if (bio->bi_size == 0) { + dev_err(CARD_TO_DEV(card), "size zero BIO!\n"); + goto req_err; + } + + bio_meta = kmem_cache_alloc(bio_meta_pool, GFP_KERNEL); + if (!bio_meta) { + st = -ENOMEM; + goto req_err; + } + + bio_meta->bio = bio; + atomic_set(&bio_meta->error, 0); + atomic_set(&bio_meta->pending_dmas, 0); + bio_meta->start_time = jiffies; + + disk_stats_start(card, bio); + + dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", + bio_data_dir(bio) ? 'W' : 'R', bio_meta, + (u64)bio->bi_sector << 9, bio->bi_size); + + st = rsxx_dma_queue_bio(card, bio, &bio_meta->pending_dmas, + bio_dma_done_cb, bio_meta); + if (st) + goto queue_err; + + return; + +queue_err: + kmem_cache_free(bio_meta_pool, bio_meta); +req_err: + bio_endio(bio, st); +} + +/*----------------- Device Setup -------------------*/ +static bool rsxx_discard_supported(struct rsxx_cardinfo *card) +{ + unsigned char pci_rev; + + pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev); + + return (pci_rev >= RSXX_DISCARD_SUPPORT); +} + +static unsigned short rsxx_get_logical_block_size( + struct rsxx_cardinfo *card) +{ + u32 capabilities = 0; + int st; + + st = rsxx_get_card_capabilities(card, &capabilities); + if (st) + dev_warn(CARD_TO_DEV(card), + "Failed reading card capabilities register\n"); + + /* Earlier firmware did not have support for 512 byte accesses */ + if (capabilities & CARD_CAP_SUBPAGE_WRITES) + return 512; + else + return RSXX_HW_BLK_SIZE; +} + +int rsxx_attach_dev(struct rsxx_cardinfo *card) +{ + mutex_lock(&card->dev_lock); + + /* The block device requires the stripe size from the config. */ + if (enable_blkdev) { + if (card->config_valid) + set_capacity(card->gendisk, card->size8 >> 9); + else + set_capacity(card->gendisk, 0); + add_disk(card->gendisk); + + card->bdev_attached = 1; + } + + mutex_unlock(&card->dev_lock); + + return 0; +} + +void rsxx_detach_dev(struct rsxx_cardinfo *card) +{ + mutex_lock(&card->dev_lock); + + if (card->bdev_attached) { + del_gendisk(card->gendisk); + card->bdev_attached = 0; + } + + mutex_unlock(&card->dev_lock); +} + +int rsxx_setup_dev(struct rsxx_cardinfo *card) +{ + unsigned short blk_size; + + mutex_init(&card->dev_lock); + + if (!enable_blkdev) + return 0; + + card->major = register_blkdev(0, DRIVER_NAME); + if (card->major < 0) { + dev_err(CARD_TO_DEV(card), "Failed to get major number\n"); + return -ENOMEM; + } + + card->queue = blk_alloc_queue(GFP_KERNEL); + if (!card->queue) { + dev_err(CARD_TO_DEV(card), "Failed queue alloc\n"); + unregister_blkdev(card->major, DRIVER_NAME); + return -ENOMEM; + } + + card->gendisk = alloc_disk(blkdev_minors); + if (!card->gendisk) { + dev_err(CARD_TO_DEV(card), "Failed disk alloc\n"); + blk_cleanup_queue(card->queue); + unregister_blkdev(card->major, DRIVER_NAME); + return -ENOMEM; + } + + blk_size = rsxx_get_logical_block_size(card); + + blk_queue_make_request(card->queue, rsxx_make_request); + blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); + blk_queue_dma_alignment(card->queue, blk_size - 1); + blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); + blk_queue_logical_block_size(card->queue, blk_size); + blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); + + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue); + if (rsxx_discard_supported(card)) { + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, card->queue); + blk_queue_max_discard_sectors(card->queue, + RSXX_HW_BLK_SIZE >> 9); + card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE; + card->queue->limits.discard_alignment = RSXX_HW_BLK_SIZE; + card->queue->limits.discard_zeroes_data = 1; + } + + card->queue->queuedata = card; + + snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name), + "rsxx%d", card->disk_id); + card->gendisk->driverfs_dev = &card->dev->dev; + card->gendisk->major = card->major; + card->gendisk->first_minor = 0; + card->gendisk->fops = &rsxx_fops; + card->gendisk->private_data = card; + card->gendisk->queue = card->queue; + + return 0; +} + +void rsxx_destroy_dev(struct rsxx_cardinfo *card) +{ + if (!enable_blkdev) + return; + + put_disk(card->gendisk); + card->gendisk = NULL; + + blk_cleanup_queue(card->queue); + unregister_blkdev(card->major, DRIVER_NAME); +} + +int rsxx_dev_init(void) +{ + bio_meta_pool = KMEM_CACHE(rsxx_bio_meta, SLAB_HWCACHE_ALIGN); + if (!bio_meta_pool) + return -ENOMEM; + + return 0; +} + +void rsxx_dev_cleanup(void) +{ + kmem_cache_destroy(bio_meta_pool); +} + + diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c new file mode 100644 index 0000000..63176e6 --- /dev/null +++ b/drivers/block/rsxx/dma.c @@ -0,0 +1,998 @@ +/* +* Filename: dma.c +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <linux/slab.h> +#include "rsxx_priv.h" + +struct rsxx_dma { + struct list_head list; + u8 cmd; + unsigned int laddr; /* Logical address on the ramsan */ + struct { + u32 off; + u32 cnt; + } sub_page; + dma_addr_t dma_addr; + struct page *page; + unsigned int pg_off; /* Page Offset */ + rsxx_dma_cb cb; + void *cb_data; +}; + +/* This timeout is used to detect a stalled DMA channel */ +#define DMA_ACTIVITY_TIMEOUT msecs_to_jiffies(10000) + +struct hw_status { + u8 status; + u8 tag; + __le16 count; + __le32 _rsvd2; + __le64 _rsvd3; +} __packed; + +enum rsxx_dma_status { + DMA_SW_ERR = 0x1, + DMA_HW_FAULT = 0x2, + DMA_CANCELLED = 0x4, +}; + +struct hw_cmd { + u8 command; + u8 tag; + u8 _rsvd; + u8 sub_page; /* Bit[0:2]: 512byte offset */ + /* Bit[4:6]: 512byte count */ + __le32 device_addr; + __le64 host_addr; +} __packed; + +enum rsxx_hw_cmd { + HW_CMD_BLK_DISCARD = 0x70, + HW_CMD_BLK_WRITE = 0x80, + HW_CMD_BLK_READ = 0xC0, + HW_CMD_BLK_RECON_READ = 0xE0, +}; + +enum rsxx_hw_status { + HW_STATUS_CRC = 0x01, + HW_STATUS_HARD_ERR = 0x02, + HW_STATUS_SOFT_ERR = 0x04, + HW_STATUS_FAULT = 0x08, +}; + +#define STATUS_BUFFER_SIZE8 4096 +#define COMMAND_BUFFER_SIZE8 4096 + +static struct kmem_cache *rsxx_dma_pool; + +struct dma_tracker { + int next_tag; + struct rsxx_dma *dma; +}; + +#define DMA_TRACKER_LIST_SIZE8 (sizeof(struct dma_tracker_list) + \ + (sizeof(struct dma_tracker) * RSXX_MAX_OUTSTANDING_CMDS)) + +struct dma_tracker_list { + spinlock_t lock; + int head; + struct dma_tracker list[0]; +}; + + +/*----------------- Misc Utility Functions -------------------*/ +static unsigned int rsxx_addr8_to_laddr(u64 addr8, struct rsxx_cardinfo *card) +{ + unsigned long long tgt_addr8; + + tgt_addr8 = ((addr8 >> card->_stripe.upper_shift) & + card->_stripe.upper_mask) | + ((addr8) & card->_stripe.lower_mask); + do_div(tgt_addr8, RSXX_HW_BLK_SIZE); + return tgt_addr8; +} + +static unsigned int rsxx_get_dma_tgt(struct rsxx_cardinfo *card, u64 addr8) +{ + unsigned int tgt; + + tgt = (addr8 >> card->_stripe.target_shift) & card->_stripe.target_mask; + + return tgt; +} + +static void rsxx_dma_queue_reset(struct rsxx_cardinfo *card) +{ + /* Reset all DMA Command/Status Queues */ + iowrite32(DMA_QUEUE_RESET, card->regmap + RESET); +} + +static unsigned int get_dma_size(struct rsxx_dma *dma) +{ + if (dma->sub_page.cnt) + return dma->sub_page.cnt << 9; + else + return RSXX_HW_BLK_SIZE; +} + + +/*----------------- DMA Tracker -------------------*/ +static void set_tracker_dma(struct dma_tracker_list *trackers, + int tag, + struct rsxx_dma *dma) +{ + trackers->list[tag].dma = dma; +} + +static struct rsxx_dma *get_tracker_dma(struct dma_tracker_list *trackers, + int tag) +{ + return trackers->list[tag].dma; +} + +static int pop_tracker(struct dma_tracker_list *trackers) +{ + int tag; + + spin_lock(&trackers->lock); + tag = trackers->head; + if (tag != -1) { + trackers->head = trackers->list[tag].next_tag; + trackers->list[tag].next_tag = -1; + } + spin_unlock(&trackers->lock); + + return tag; +} + +static void push_tracker(struct dma_tracker_list *trackers, int tag) +{ + spin_lock(&trackers->lock); + trackers->list[tag].next_tag = trackers->head; + trackers->head = tag; + trackers->list[tag].dma = NULL; + spin_unlock(&trackers->lock); +} + + +/*----------------- Interrupt Coalescing -------------*/ +/* + * Interrupt Coalescing Register Format: + * Interrupt Timer (64ns units) [15:0] + * Interrupt Count [24:16] + * Reserved [31:25] +*/ +#define INTR_COAL_LATENCY_MASK (0x0000ffff) + +#define INTR_COAL_COUNT_SHIFT 16 +#define INTR_COAL_COUNT_BITS 9 +#define INTR_COAL_COUNT_MASK (((1 << INTR_COAL_COUNT_BITS) - 1) << \ + INTR_COAL_COUNT_SHIFT) +#define INTR_COAL_LATENCY_UNITS_NS 64 + + +static u32 dma_intr_coal_val(u32 mode, u32 count, u32 latency) +{ + u32 latency_units = latency / INTR_COAL_LATENCY_UNITS_NS; + + if (mode == RSXX_INTR_COAL_DISABLED) + return 0; + + return ((count << INTR_COAL_COUNT_SHIFT) & INTR_COAL_COUNT_MASK) | + (latency_units & INTR_COAL_LATENCY_MASK); + +} + +static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card) +{ + int i; + u32 q_depth = 0; + u32 intr_coal; + + if (card->config.data.intr_coal.mode != RSXX_INTR_COAL_AUTO_TUNE) + return; + + for (i = 0; i < card->n_targets; i++) + q_depth += atomic_read(&card->ctrl[i].stats.hw_q_depth); + + intr_coal = dma_intr_coal_val(card->config.data.intr_coal.mode, + q_depth / 2, + card->config.data.intr_coal.latency); + iowrite32(intr_coal, card->regmap + INTR_COAL); +} + +/*----------------- RSXX DMA Handling -------------------*/ +static void rsxx_complete_dma(struct rsxx_cardinfo *card, + struct rsxx_dma *dma, + unsigned int status) +{ + if (status & DMA_SW_ERR) + printk_ratelimited(KERN_ERR + "SW Error in DMA(cmd x%02x, laddr x%08x)\n", + dma->cmd, dma->laddr); + if (status & DMA_HW_FAULT) + printk_ratelimited(KERN_ERR + "HW Fault in DMA(cmd x%02x, laddr x%08x)\n", + dma->cmd, dma->laddr); + if (status & DMA_CANCELLED) + printk_ratelimited(KERN_ERR + "DMA Cancelled(cmd x%02x, laddr x%08x)\n", + dma->cmd, dma->laddr); + + if (dma->dma_addr) + pci_unmap_page(card->dev, dma->dma_addr, get_dma_size(dma), + dma->cmd == HW_CMD_BLK_WRITE ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + + if (dma->cb) + dma->cb(card, dma->cb_data, status ? 1 : 0); + + kmem_cache_free(rsxx_dma_pool, dma); +} + +static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, + struct rsxx_dma *dma) +{ + /* + * Requeued DMAs go to the front of the queue so they are issued + * first. + */ + spin_lock(&ctrl->queue_lock); + list_add(&dma->list, &ctrl->queue); + spin_unlock(&ctrl->queue_lock); +} + +static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, + struct rsxx_dma *dma, + u8 hw_st) +{ + unsigned int status = 0; + int requeue_cmd = 0; + + dev_dbg(CARD_TO_DEV(ctrl->card), + "Handling DMA error(cmd x%02x, laddr x%08x st:x%02x)\n", + dma->cmd, dma->laddr, hw_st); + + if (hw_st & HW_STATUS_CRC) + ctrl->stats.crc_errors++; + if (hw_st & HW_STATUS_HARD_ERR) + ctrl->stats.hard_errors++; + if (hw_st & HW_STATUS_SOFT_ERR) + ctrl->stats.soft_errors++; + + switch (dma->cmd) { + case HW_CMD_BLK_READ: + if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) { + if (ctrl->card->scrub_hard) { + dma->cmd = HW_CMD_BLK_RECON_READ; + requeue_cmd = 1; + ctrl->stats.reads_retried++; + } else { + status |= DMA_HW_FAULT; + ctrl->stats.reads_failed++; + } + } else if (hw_st & HW_STATUS_FAULT) { + status |= DMA_HW_FAULT; + ctrl->stats.reads_failed++; + } + + break; + case HW_CMD_BLK_RECON_READ: + if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) { + /* Data could not be reconstructed. */ + status |= DMA_HW_FAULT; + ctrl->stats.reads_failed++; + } + + break; + case HW_CMD_BLK_WRITE: + status |= DMA_HW_FAULT; + ctrl->stats.writes_failed++; + + break; + case HW_CMD_BLK_DISCARD: + status |= DMA_HW_FAULT; + ctrl->stats.discards_failed++; + + break; + default: + dev_err(CARD_TO_DEV(ctrl->card), + "Unknown command in DMA!(cmd: x%02x " + "laddr x%08x st: x%02x\n", + dma->cmd, dma->laddr, hw_st); + status |= DMA_SW_ERR; + + break; + } + + if (requeue_cmd) + rsxx_requeue_dma(ctrl, dma); + else + rsxx_complete_dma(ctrl->card, dma, status); +} + +static void dma_engine_stalled(unsigned long data) +{ + struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data; + + if (atomic_read(&ctrl->stats.hw_q_depth) == 0) + return; + + if (ctrl->cmd.idx != ioread32(ctrl->regmap + SW_CMD_IDX)) { + /* + * The dma engine was stalled because the SW_CMD_IDX write + * was lost. Issue it again to recover. + */ + dev_warn(CARD_TO_DEV(ctrl->card), + "SW_CMD_IDX write was lost, re-writing...\n"); + iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); + mod_timer(&ctrl->activity_timer, + jiffies + DMA_ACTIVITY_TIMEOUT); + } else { + dev_warn(CARD_TO_DEV(ctrl->card), + "DMA channel %d has stalled, faulting interface.\n", + ctrl->id); + ctrl->card->dma_fault = 1; + } +} + +static void rsxx_issue_dmas(struct work_struct *work) +{ + struct rsxx_dma_ctrl *ctrl; + struct rsxx_dma *dma; + int tag; + int cmds_pending = 0; + struct hw_cmd *hw_cmd_buf; + + ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work); + hw_cmd_buf = ctrl->cmd.buf; + + if (unlikely(ctrl->card->halt)) + return; + + while (1) { + spin_lock(&ctrl->queue_lock); + if (list_empty(&ctrl->queue)) { + spin_unlock(&ctrl->queue_lock); + break; + } + spin_unlock(&ctrl->queue_lock); + + tag = pop_tracker(ctrl->trackers); + if (tag == -1) + break; + + spin_lock(&ctrl->queue_lock); + dma = list_entry(ctrl->queue.next, struct rsxx_dma, list); + list_del(&dma->list); + ctrl->stats.sw_q_depth--; + spin_unlock(&ctrl->queue_lock); + + /* + * This will catch any DMAs that slipped in right before the + * fault, but was queued after all the other DMAs were + * cancelled. + */ + if (unlikely(ctrl->card->dma_fault)) { + push_tracker(ctrl->trackers, tag); + rsxx_complete_dma(ctrl->card, dma, DMA_CANCELLED); + continue; + } + + set_tracker_dma(ctrl->trackers, tag, dma); + hw_cmd_buf[ctrl->cmd.idx].command = dma->cmd; + hw_cmd_buf[ctrl->cmd.idx].tag = tag; + hw_cmd_buf[ctrl->cmd.idx]._rsvd = 0; + hw_cmd_buf[ctrl->cmd.idx].sub_page = + ((dma->sub_page.cnt & 0x7) << 4) | + (dma->sub_page.off & 0x7); + + hw_cmd_buf[ctrl->cmd.idx].device_addr = + cpu_to_le32(dma->laddr); + + hw_cmd_buf[ctrl->cmd.idx].host_addr = + cpu_to_le64(dma->dma_addr); + + dev_dbg(CARD_TO_DEV(ctrl->card), + "Issue DMA%d(laddr %d tag %d) to idx %d\n", + ctrl->id, dma->laddr, tag, ctrl->cmd.idx); + + ctrl->cmd.idx = (ctrl->cmd.idx + 1) & RSXX_CS_IDX_MASK; + cmds_pending++; + + if (dma->cmd == HW_CMD_BLK_WRITE) + ctrl->stats.writes_issued++; + else if (dma->cmd == HW_CMD_BLK_DISCARD) + ctrl->stats.discards_issued++; + else + ctrl->stats.reads_issued++; + } + + /* Let HW know we've queued commands. */ + if (cmds_pending) { + /* + * We must guarantee that the CPU writes to 'ctrl->cmd.buf' + * (which is in PCI-consistent system-memory) from the loop + * above make it into the coherency domain before the + * following PIO "trigger" updating the cmd.idx. A WMB is + * sufficient. We need not explicitly CPU cache-flush since + * the memory is a PCI-consistent (ie; coherent) mapping. + */ + wmb(); + + atomic_add(cmds_pending, &ctrl->stats.hw_q_depth); + mod_timer(&ctrl->activity_timer, + jiffies + DMA_ACTIVITY_TIMEOUT); + iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); + } +} + +static void rsxx_dma_done(struct work_struct *work) +{ + struct rsxx_dma_ctrl *ctrl; + struct rsxx_dma *dma; + unsigned long flags; + u16 count; + u8 status; + u8 tag; + struct hw_status *hw_st_buf; + + ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work); + hw_st_buf = ctrl->status.buf; + + if (unlikely(ctrl->card->halt) || + unlikely(ctrl->card->dma_fault)) + return; + + count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count); + + while (count == ctrl->e_cnt) { + /* + * The read memory-barrier is necessary to keep aggressive + * processors/optimizers (such as the PPC Apple G5) from + * reordering the following status-buffer tag & status read + * *before* the count read on subsequent iterations of the + * loop! + */ + rmb(); + + status = hw_st_buf[ctrl->status.idx].status; + tag = hw_st_buf[ctrl->status.idx].tag; + + dma = get_tracker_dma(ctrl->trackers, tag); + if (dma == NULL) { + spin_lock_irqsave(&ctrl->card->irq_lock, flags); + rsxx_disable_ier(ctrl->card, CR_INTR_DMA_ALL); + spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); + + dev_err(CARD_TO_DEV(ctrl->card), + "No tracker for tag %d " + "(idx %d id %d)\n", + tag, ctrl->status.idx, ctrl->id); + return; + } + + dev_dbg(CARD_TO_DEV(ctrl->card), + "Completing DMA%d" + "(laddr x%x tag %d st: x%x cnt: x%04x) from idx %d.\n", + ctrl->id, dma->laddr, tag, status, count, + ctrl->status.idx); + + atomic_dec(&ctrl->stats.hw_q_depth); + + mod_timer(&ctrl->activity_timer, + jiffies + DMA_ACTIVITY_TIMEOUT); + + if (status) + rsxx_handle_dma_error(ctrl, dma, status); + else + rsxx_complete_dma(ctrl->card, dma, 0); + + push_tracker(ctrl->trackers, tag); + + ctrl->status.idx = (ctrl->status.idx + 1) & + RSXX_CS_IDX_MASK; + ctrl->e_cnt++; + + count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count); + } + + dma_intr_coal_auto_tune(ctrl->card); + + if (atomic_read(&ctrl->stats.hw_q_depth) == 0) + del_timer_sync(&ctrl->activity_timer); + + spin_lock_irqsave(&ctrl->card->irq_lock, flags); + rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id)); + spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); + + spin_lock(&ctrl->queue_lock); + if (ctrl->stats.sw_q_depth) + queue_work(ctrl->issue_wq, &ctrl->issue_dma_work); + spin_unlock(&ctrl->queue_lock); +} + +static int rsxx_cleanup_dma_queue(struct rsxx_cardinfo *card, + struct list_head *q) +{ + struct rsxx_dma *dma; + struct rsxx_dma *tmp; + int cnt = 0; + + list_for_each_entry_safe(dma, tmp, q, list) { + list_del(&dma->list); + + if (dma->dma_addr) + pci_unmap_page(card->dev, dma->dma_addr, + get_dma_size(dma), + (dma->cmd == HW_CMD_BLK_WRITE) ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + kmem_cache_free(rsxx_dma_pool, dma); + cnt++; + } + + return cnt; +} + +static int rsxx_queue_discard(struct rsxx_cardinfo *card, + struct list_head *q, + unsigned int laddr, + rsxx_dma_cb cb, + void *cb_data) +{ + struct rsxx_dma *dma; + + dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); + if (!dma) + return -ENOMEM; + + dma->cmd = HW_CMD_BLK_DISCARD; + dma->laddr = laddr; + dma->dma_addr = 0; + dma->sub_page.off = 0; + dma->sub_page.cnt = 0; + dma->page = NULL; + dma->pg_off = 0; + dma->cb = cb; + dma->cb_data = cb_data; + + dev_dbg(CARD_TO_DEV(card), "Queuing[D] laddr %x\n", dma->laddr); + + list_add_tail(&dma->list, q); + + return 0; +} + +static int rsxx_queue_dma(struct rsxx_cardinfo *card, + struct list_head *q, + int dir, + unsigned int dma_off, + unsigned int dma_len, + unsigned int laddr, + struct page *page, + unsigned int pg_off, + rsxx_dma_cb cb, + void *cb_data) +{ + struct rsxx_dma *dma; + + dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); + if (!dma) + return -ENOMEM; + + dma->dma_addr = pci_map_page(card->dev, page, pg_off, dma_len, + dir ? PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + if (!dma->dma_addr) { + kmem_cache_free(rsxx_dma_pool, dma); + return -ENOMEM; + } + + dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ; + dma->laddr = laddr; + dma->sub_page.off = (dma_off >> 9); + dma->sub_page.cnt = (dma_len >> 9); + dma->page = page; + dma->pg_off = pg_off; + dma->cb = cb; + dma->cb_data = cb_data; + + dev_dbg(CARD_TO_DEV(card), + "Queuing[%c] laddr %x off %d cnt %d page %p pg_off %d\n", + dir ? 'W' : 'R', dma->laddr, dma->sub_page.off, + dma->sub_page.cnt, dma->page, dma->pg_off); + + /* Queue the DMA */ + list_add_tail(&dma->list, q); + + return 0; +} + +int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, + struct bio *bio, + atomic_t *n_dmas, + rsxx_dma_cb cb, + void *cb_data) +{ + struct list_head dma_list[RSXX_MAX_TARGETS]; + struct bio_vec *bvec; + unsigned long long addr8; + unsigned int laddr; + unsigned int bv_len; + unsigned int bv_off; + unsigned int dma_off; + unsigned int dma_len; + int dma_cnt[RSXX_MAX_TARGETS]; + int tgt; + int st; + int i; + + addr8 = bio->bi_sector << 9; /* sectors are 512 bytes */ + atomic_set(n_dmas, 0); + + for (i = 0; i < card->n_targets; i++) { + INIT_LIST_HEAD(&dma_list[i]); + dma_cnt[i] = 0; + } + + if (bio->bi_rw & REQ_DISCARD) { + bv_len = bio->bi_size; + + while (bv_len > 0) { + tgt = rsxx_get_dma_tgt(card, addr8); + laddr = rsxx_addr8_to_laddr(addr8, card); + + st = rsxx_queue_discard(card, &dma_list[tgt], laddr, + cb, cb_data); + if (st) + goto bvec_err; + + dma_cnt[tgt]++; + atomic_inc(n_dmas); + addr8 += RSXX_HW_BLK_SIZE; + bv_len -= RSXX_HW_BLK_SIZE; + } + } else { + bio_for_each_segment(bvec, bio, i) { + bv_len = bvec->bv_len; + bv_off = bvec->bv_offset; + + while (bv_len > 0) { + tgt = rsxx_get_dma_tgt(card, addr8); + laddr = rsxx_addr8_to_laddr(addr8, card); + dma_off = addr8 & RSXX_HW_BLK_MASK; + dma_len = min(bv_len, + RSXX_HW_BLK_SIZE - dma_off); + + st = rsxx_queue_dma(card, &dma_list[tgt], + bio_data_dir(bio), + dma_off, dma_len, + laddr, bvec->bv_page, + bv_off, cb, cb_data); + if (st) + goto bvec_err; + + dma_cnt[tgt]++; + atomic_inc(n_dmas); + addr8 += dma_len; + bv_off += dma_len; + bv_len -= dma_len; + } + } + } + + for (i = 0; i < card->n_targets; i++) { + if (!list_empty(&dma_list[i])) { + spin_lock(&card->ctrl[i].queue_lock); + card->ctrl[i].stats.sw_q_depth += dma_cnt[i]; + list_splice_tail(&dma_list[i], &card->ctrl[i].queue); + spin_unlock(&card->ctrl[i].queue_lock); + + queue_work(card->ctrl[i].issue_wq, + &card->ctrl[i].issue_dma_work); + } + } + + return 0; + +bvec_err: + for (i = 0; i < card->n_targets; i++) + rsxx_cleanup_dma_queue(card, &dma_list[i]); + + return st; +} + + +/*----------------- DMA Engine Initialization & Setup -------------------*/ +static int rsxx_dma_ctrl_init(struct pci_dev *dev, + struct rsxx_dma_ctrl *ctrl) +{ + int i; + + memset(&ctrl->stats, 0, sizeof(ctrl->stats)); + + ctrl->status.buf = pci_alloc_consistent(dev, STATUS_BUFFER_SIZE8, + &ctrl->status.dma_addr); + ctrl->cmd.buf = pci_alloc_consistent(dev, COMMAND_BUFFER_SIZE8, + &ctrl->cmd.dma_addr); + if (ctrl->status.buf == NULL || ctrl->cmd.buf == NULL) + return -ENOMEM; + + ctrl->trackers = vmalloc(DMA_TRACKER_LIST_SIZE8); + if (!ctrl->trackers) + return -ENOMEM; + + ctrl->trackers->head = 0; + for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) { + ctrl->trackers->list[i].next_tag = i + 1; + ctrl->trackers->list[i].dma = NULL; + } + ctrl->trackers->list[RSXX_MAX_OUTSTANDING_CMDS-1].next_tag = -1; + spin_lock_init(&ctrl->trackers->lock); + + spin_lock_init(&ctrl->queue_lock); + INIT_LIST_HEAD(&ctrl->queue); + + setup_timer(&ctrl->activity_timer, dma_engine_stalled, + (unsigned long)ctrl); + + ctrl->issue_wq = alloc_ordered_workqueue(DRIVER_NAME"_issue", 0); + if (!ctrl->issue_wq) + return -ENOMEM; + + ctrl->done_wq = alloc_ordered_workqueue(DRIVER_NAME"_done", 0); + if (!ctrl->done_wq) + return -ENOMEM; + + INIT_WORK(&ctrl->issue_dma_work, rsxx_issue_dmas); + INIT_WORK(&ctrl->dma_done_work, rsxx_dma_done); + + memset(ctrl->status.buf, 0xac, STATUS_BUFFER_SIZE8); + iowrite32(lower_32_bits(ctrl->status.dma_addr), + ctrl->regmap + SB_ADD_LO); + iowrite32(upper_32_bits(ctrl->status.dma_addr), + ctrl->regmap + SB_ADD_HI); + + memset(ctrl->cmd.buf, 0x83, COMMAND_BUFFER_SIZE8); + iowrite32(lower_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_LO); + iowrite32(upper_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_HI); + + ctrl->status.idx = ioread32(ctrl->regmap + HW_STATUS_CNT); + if (ctrl->status.idx > RSXX_MAX_OUTSTANDING_CMDS) { + dev_crit(&dev->dev, "Failed reading status cnt x%x\n", + ctrl->status.idx); + return -EINVAL; + } + iowrite32(ctrl->status.idx, ctrl->regmap + HW_STATUS_CNT); + iowrite32(ctrl->status.idx, ctrl->regmap + SW_STATUS_CNT); + + ctrl->cmd.idx = ioread32(ctrl->regmap + HW_CMD_IDX); + if (ctrl->cmd.idx > RSXX_MAX_OUTSTANDING_CMDS) { + dev_crit(&dev->dev, "Failed reading cmd cnt x%x\n", + ctrl->status.idx); + return -EINVAL; + } + iowrite32(ctrl->cmd.idx, ctrl->regmap + HW_CMD_IDX); + iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); + + wmb(); + + return 0; +} + +static int rsxx_dma_stripe_setup(struct rsxx_cardinfo *card, + unsigned int stripe_size8) +{ + if (!is_power_of_2(stripe_size8)) { + dev_err(CARD_TO_DEV(card), + "stripe_size is NOT a power of 2!\n"); + return -EINVAL; + } + + card->_stripe.lower_mask = stripe_size8 - 1; + + card->_stripe.upper_mask = ~(card->_stripe.lower_mask); + card->_stripe.upper_shift = ffs(card->n_targets) - 1; + + card->_stripe.target_mask = card->n_targets - 1; + card->_stripe.target_shift = ffs(stripe_size8) - 1; + + dev_dbg(CARD_TO_DEV(card), "_stripe.lower_mask = x%016llx\n", + card->_stripe.lower_mask); + dev_dbg(CARD_TO_DEV(card), "_stripe.upper_shift = x%016llx\n", + card->_stripe.upper_shift); + dev_dbg(CARD_TO_DEV(card), "_stripe.upper_mask = x%016llx\n", + card->_stripe.upper_mask); + dev_dbg(CARD_TO_DEV(card), "_stripe.target_mask = x%016llx\n", + card->_stripe.target_mask); + dev_dbg(CARD_TO_DEV(card), "_stripe.target_shift = x%016llx\n", + card->_stripe.target_shift); + + return 0; +} + +static int rsxx_dma_configure(struct rsxx_cardinfo *card) +{ + u32 intr_coal; + + intr_coal = dma_intr_coal_val(card->config.data.intr_coal.mode, + card->config.data.intr_coal.count, + card->config.data.intr_coal.latency); + iowrite32(intr_coal, card->regmap + INTR_COAL); + + return rsxx_dma_stripe_setup(card, card->config.data.stripe_size); +} + +int rsxx_dma_setup(struct rsxx_cardinfo *card) +{ + unsigned long flags; + int st; + int i; + + dev_info(CARD_TO_DEV(card), + "Initializing %d DMA targets\n", + card->n_targets); + + /* Regmap is divided up into 4K chunks. One for each DMA channel */ + for (i = 0; i < card->n_targets; i++) + card->ctrl[i].regmap = card->regmap + (i * 4096); + + card->dma_fault = 0; + + /* Reset the DMA queues */ + rsxx_dma_queue_reset(card); + + /************* Setup DMA Control *************/ + for (i = 0; i < card->n_targets; i++) { + st = rsxx_dma_ctrl_init(card->dev, &card->ctrl[i]); + if (st) + goto failed_dma_setup; + + card->ctrl[i].card = card; + card->ctrl[i].id = i; + } + + card->scrub_hard = 1; + + if (card->config_valid) + rsxx_dma_configure(card); + + /* Enable the interrupts after all setup has completed. */ + for (i = 0; i < card->n_targets; i++) { + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_enable_ier_and_isr(card, CR_INTR_DMA(i)); + spin_unlock_irqrestore(&card->irq_lock, flags); + } + + return 0; + +failed_dma_setup: + for (i = 0; i < card->n_targets; i++) { + struct rsxx_dma_ctrl *ctrl = &card->ctrl[i]; + + if (ctrl->issue_wq) { + destroy_workqueue(ctrl->issue_wq); + ctrl->issue_wq = NULL; + } + + if (ctrl->done_wq) { + destroy_workqueue(ctrl->done_wq); + ctrl->done_wq = NULL; + } + + if (ctrl->trackers) + vfree(ctrl->trackers); + + if (ctrl->status.buf) + pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8, + ctrl->status.buf, + ctrl->status.dma_addr); + if (ctrl->cmd.buf) + pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8, + ctrl->cmd.buf, ctrl->cmd.dma_addr); + } + + return st; +} + + +void rsxx_dma_destroy(struct rsxx_cardinfo *card) +{ + struct rsxx_dma_ctrl *ctrl; + struct rsxx_dma *dma; + int i, j; + int cnt = 0; + + for (i = 0; i < card->n_targets; i++) { + ctrl = &card->ctrl[i]; + + if (ctrl->issue_wq) { + destroy_workqueue(ctrl->issue_wq); + ctrl->issue_wq = NULL; + } + + if (ctrl->done_wq) { + destroy_workqueue(ctrl->done_wq); + ctrl->done_wq = NULL; + } + + if (timer_pending(&ctrl->activity_timer)) + del_timer_sync(&ctrl->activity_timer); + + /* Clean up the DMA queue */ + spin_lock(&ctrl->queue_lock); + cnt = rsxx_cleanup_dma_queue(card, &ctrl->queue); + spin_unlock(&ctrl->queue_lock); + + if (cnt) + dev_info(CARD_TO_DEV(card), + "Freed %d queued DMAs on channel %d\n", + cnt, i); + + /* Clean up issued DMAs */ + for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) { + dma = get_tracker_dma(ctrl->trackers, j); + if (dma) { + pci_unmap_page(card->dev, dma->dma_addr, + get_dma_size(dma), + (dma->cmd == HW_CMD_BLK_WRITE) ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + kmem_cache_free(rsxx_dma_pool, dma); + cnt++; + } + } + + if (cnt) + dev_info(CARD_TO_DEV(card), + "Freed %d pending DMAs on channel %d\n", + cnt, i); + + vfree(ctrl->trackers); + + pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8, + ctrl->status.buf, ctrl->status.dma_addr); + pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8, + ctrl->cmd.buf, ctrl->cmd.dma_addr); + } +} + + +int rsxx_dma_init(void) +{ + rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN); + if (!rsxx_dma_pool) + return -ENOMEM; + + return 0; +} + + +void rsxx_dma_cleanup(void) +{ + kmem_cache_destroy(rsxx_dma_pool); +} + diff --git a/drivers/block/rsxx/rsxx.h b/drivers/block/rsxx/rsxx.h new file mode 100644 index 0000000..2e50b65 --- /dev/null +++ b/drivers/block/rsxx/rsxx.h @@ -0,0 +1,45 @@ +/* +* Filename: rsxx.h +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __RSXX_H__ +#define __RSXX_H__ + +/*----------------- IOCTL Definitions -------------------*/ + +struct rsxx_reg_access { + __u32 addr; + __u32 cnt; + __u32 stat; + __u32 stream; + __u32 data[8]; +}; + +#define RSXX_MAX_REG_CNT (8 * (sizeof(__u32))) + +#define RSXX_IOC_MAGIC 'r' + +#define RSXX_GETREG _IOWR(RSXX_IOC_MAGIC, 0x20, struct rsxx_reg_access) +#define RSXX_SETREG _IOWR(RSXX_IOC_MAGIC, 0x21, struct rsxx_reg_access) + +#endif /* __RSXX_H_ */ diff --git a/drivers/block/rsxx/rsxx_cfg.h b/drivers/block/rsxx/rsxx_cfg.h new file mode 100644 index 0000000..c025fe5 --- /dev/null +++ b/drivers/block/rsxx/rsxx_cfg.h @@ -0,0 +1,72 @@ +/* +* Filename: rsXX_cfg.h +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __RSXX_CFG_H__ +#define __RSXX_CFG_H__ + +/* NOTE: Config values will be saved in network byte order (i.e. Big endian) */ +#include <linux/types.h> + +/* + * The card config version must match the driver's expected version. If it does + * not, the DMA interfaces will not be attached and the user will need to + * initialize/upgrade the card configuration using the card config utility. + */ +#define RSXX_CFG_VERSION 4 + +struct card_cfg_hdr { + __u32 version; + __u32 crc; +}; + +struct card_cfg_data { + __u32 block_size; + __u32 stripe_size; + __u32 vendor_id; + __u32 cache_order; + struct { + __u32 mode; /* Disabled, manual, auto-tune... */ + __u32 count; /* Number of intr to coalesce */ + __u32 latency;/* Max wait time (in ns) */ + } intr_coal; +}; + +struct rsxx_card_cfg { + struct card_cfg_hdr hdr; + struct card_cfg_data data; +}; + +/* Vendor ID Values */ +#define RSXX_VENDOR_ID_TMS_IBM 0 +#define RSXX_VENDOR_ID_DSI 1 +#define RSXX_VENDOR_COUNT 2 + +/* Interrupt Coalescing Values */ +#define RSXX_INTR_COAL_DISABLED 0 +#define RSXX_INTR_COAL_EXPLICIT 1 +#define RSXX_INTR_COAL_AUTO_TUNE 2 + + +#endif /* __RSXX_CFG_H__ */ + diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h new file mode 100644 index 0000000..a1ac907 --- /dev/null +++ b/drivers/block/rsxx/rsxx_priv.h @@ -0,0 +1,399 @@ +/* +* Filename: rsxx_priv.h +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __RSXX_PRIV_H__ +#define __RSXX_PRIV_H__ + +#include <linux/version.h> +#include <linux/semaphore.h> + +#include <linux/fs.h> +#include <linux/interrupt.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/spinlock.h> +#include <linux/sysfs.h> +#include <linux/workqueue.h> +#include <linux/bio.h> +#include <linux/vmalloc.h> +#include <linux/timer.h> +#include <linux/ioctl.h> + +#include "rsxx.h" +#include "rsxx_cfg.h" + +struct proc_cmd; + +#define PCI_VENDOR_ID_TMS_IBM 0x15B6 +#define PCI_DEVICE_ID_RS70_FLASH 0x0019 +#define PCI_DEVICE_ID_RS70D_FLASH 0x001A +#define PCI_DEVICE_ID_RS80_FLASH 0x001C +#define PCI_DEVICE_ID_RS81_FLASH 0x001E + +#define RS70_PCI_REV_SUPPORTED 4 + +#define DRIVER_NAME "rsxx" +#define DRIVER_VERSION "3.7" + +/* Block size is 4096 */ +#define RSXX_HW_BLK_SHIFT 12 +#define RSXX_HW_BLK_SIZE (1 << RSXX_HW_BLK_SHIFT) +#define RSXX_HW_BLK_MASK (RSXX_HW_BLK_SIZE - 1) + +#define MAX_CREG_DATA8 32 +#define LOG_BUF_SIZE8 128 + +#define RSXX_MAX_OUTSTANDING_CMDS 255 +#define RSXX_CS_IDX_MASK 0xff + +#define RSXX_MAX_TARGETS 8 + +struct dma_tracker_list; + +/* DMA Command/Status Buffer structure */ +struct rsxx_cs_buffer { + dma_addr_t dma_addr; + void *buf; + u32 idx; +}; + +struct rsxx_dma_stats { + u32 crc_errors; + u32 hard_errors; + u32 soft_errors; + u32 writes_issued; + u32 writes_failed; + u32 reads_issued; + u32 reads_failed; + u32 reads_retried; + u32 discards_issued; + u32 discards_failed; + u32 done_rescheduled; + u32 issue_rescheduled; + u32 sw_q_depth; /* Number of DMAs on the SW queue. */ + atomic_t hw_q_depth; /* Number of DMAs queued to HW. */ +}; + +struct rsxx_dma_ctrl { + struct rsxx_cardinfo *card; + int id; + void __iomem *regmap; + struct rsxx_cs_buffer status; + struct rsxx_cs_buffer cmd; + u16 e_cnt; + spinlock_t queue_lock; + struct list_head queue; + struct workqueue_struct *issue_wq; + struct work_struct issue_dma_work; + struct workqueue_struct *done_wq; + struct work_struct dma_done_work; + struct timer_list activity_timer; + struct dma_tracker_list *trackers; + struct rsxx_dma_stats stats; +}; + +struct rsxx_cardinfo { + struct pci_dev *dev; + unsigned int halt; + + void __iomem *regmap; + spinlock_t irq_lock; + unsigned int isr_mask; + unsigned int ier_mask; + + struct rsxx_card_cfg config; + int config_valid; + + /* Embedded CPU Communication */ + struct { + spinlock_t lock; + bool active; + struct creg_cmd *active_cmd; + struct work_struct done_work; + struct list_head queue; + unsigned int q_depth; + /* Cache the creg status to prevent ioreads */ + struct { + u32 stat; + u32 failed_cancel_timer; + u32 creg_timeout; + } creg_stats; + struct timer_list cmd_timer; + struct mutex reset_lock; + int reset; + } creg_ctrl; + + struct { + char tmp[MAX_CREG_DATA8]; + char buf[LOG_BUF_SIZE8]; /* terminated */ + int buf_len; + } log; + + struct work_struct event_work; + unsigned int state; + u64 size8; + + /* Lock the device attach/detach function */ + struct mutex dev_lock; + + /* Block Device Variables */ + bool bdev_attached; + int disk_id; + int major; + struct request_queue *queue; + struct gendisk *gendisk; + struct { + /* Used to convert a byte address to a device address. */ + u64 lower_mask; + u64 upper_shift; + u64 upper_mask; + u64 target_mask; + u64 target_shift; + } _stripe; + unsigned int dma_fault; + + int scrub_hard; + + int n_targets; + struct rsxx_dma_ctrl *ctrl; +}; + +enum rsxx_pci_regmap { + HWID = 0x00, /* Hardware Identification Register */ + SCRATCH = 0x04, /* Scratch/Debug Register */ + RESET = 0x08, /* Reset Register */ + ISR = 0x10, /* Interrupt Status Register */ + IER = 0x14, /* Interrupt Enable Register */ + IPR = 0x18, /* Interrupt Poll Register */ + CB_ADD_LO = 0x20, /* Command Host Buffer Address [31:0] */ + CB_ADD_HI = 0x24, /* Command Host Buffer Address [63:32]*/ + HW_CMD_IDX = 0x28, /* Hardware Processed Command Index */ + SW_CMD_IDX = 0x2C, /* Software Processed Command Index */ + SB_ADD_LO = 0x30, /* Status Host Buffer Address [31:0] */ + SB_ADD_HI = 0x34, /* Status Host Buffer Address [63:32] */ + HW_STATUS_CNT = 0x38, /* Hardware Status Counter */ + SW_STATUS_CNT = 0x3C, /* Deprecated */ + CREG_CMD = 0x40, /* CPU Command Register */ + CREG_ADD = 0x44, /* CPU Address Register */ + CREG_CNT = 0x48, /* CPU Count Register */ + CREG_STAT = 0x4C, /* CPU Status Register */ + CREG_DATA0 = 0x50, /* CPU Data Registers */ + CREG_DATA1 = 0x54, + CREG_DATA2 = 0x58, + CREG_DATA3 = 0x5C, + CREG_DATA4 = 0x60, + CREG_DATA5 = 0x64, + CREG_DATA6 = 0x68, + CREG_DATA7 = 0x6c, + INTR_COAL = 0x70, /* Interrupt Coalescing Register */ + HW_ERROR = 0x74, /* Card Error Register */ + PCI_DEBUG0 = 0x78, /* PCI Debug Registers */ + PCI_DEBUG1 = 0x7C, + PCI_DEBUG2 = 0x80, + PCI_DEBUG3 = 0x84, + PCI_DEBUG4 = 0x88, + PCI_DEBUG5 = 0x8C, + PCI_DEBUG6 = 0x90, + PCI_DEBUG7 = 0x94, + PCI_POWER_THROTTLE = 0x98, + PERF_CTRL = 0x9c, + PERF_TIMER_LO = 0xa0, + PERF_TIMER_HI = 0xa4, + PERF_RD512_LO = 0xa8, + PERF_RD512_HI = 0xac, + PERF_WR512_LO = 0xb0, + PERF_WR512_HI = 0xb4, +}; + +enum rsxx_intr { + CR_INTR_DMA0 = 0x00000001, + CR_INTR_CREG = 0x00000002, + CR_INTR_DMA1 = 0x00000004, + CR_INTR_EVENT = 0x00000008, + CR_INTR_DMA2 = 0x00000010, + CR_INTR_DMA3 = 0x00000020, + CR_INTR_DMA4 = 0x00000040, + CR_INTR_DMA5 = 0x00000080, + CR_INTR_DMA6 = 0x00000100, + CR_INTR_DMA7 = 0x00000200, + CR_INTR_DMA_ALL = 0x000003f5, + CR_INTR_ALL = 0xffffffff, +}; + +static inline int CR_INTR_DMA(int N) +{ + static const unsigned int _CR_INTR_DMA[] = { + CR_INTR_DMA0, CR_INTR_DMA1, CR_INTR_DMA2, CR_INTR_DMA3, + CR_INTR_DMA4, CR_INTR_DMA5, CR_INTR_DMA6, CR_INTR_DMA7 + }; + return _CR_INTR_DMA[N]; +} +enum rsxx_pci_reset { + DMA_QUEUE_RESET = 0x00000001, +}; + +enum rsxx_pci_revision { + RSXX_DISCARD_SUPPORT = 2, +}; + +enum rsxx_creg_cmd { + CREG_CMD_TAG_MASK = 0x0000FF00, + CREG_OP_WRITE = 0x000000C0, + CREG_OP_READ = 0x000000E0, +}; + +enum rsxx_creg_addr { + CREG_ADD_CARD_CMD = 0x80001000, + CREG_ADD_CARD_STATE = 0x80001004, + CREG_ADD_CARD_SIZE = 0x8000100c, + CREG_ADD_CAPABILITIES = 0x80001050, + CREG_ADD_LOG = 0x80002000, + CREG_ADD_NUM_TARGETS = 0x80003000, + CREG_ADD_CONFIG = 0xB0000000, +}; + +enum rsxx_creg_card_cmd { + CARD_CMD_STARTUP = 1, + CARD_CMD_SHUTDOWN = 2, + CARD_CMD_LOW_LEVEL_FORMAT = 3, + CARD_CMD_FPGA_RECONFIG_BR = 4, + CARD_CMD_FPGA_RECONFIG_MAIN = 5, + CARD_CMD_BACKUP = 6, + CARD_CMD_RESET = 7, + CARD_CMD_deprecated = 8, + CARD_CMD_UNINITIALIZE = 9, + CARD_CMD_DSTROY_EMERGENCY = 10, + CARD_CMD_DSTROY_NORMAL = 11, + CARD_CMD_DSTROY_EXTENDED = 12, + CARD_CMD_DSTROY_ABORT = 13, +}; + +enum rsxx_card_state { + CARD_STATE_SHUTDOWN = 0x00000001, + CARD_STATE_STARTING = 0x00000002, + CARD_STATE_FORMATTING = 0x00000004, + CARD_STATE_UNINITIALIZED = 0x00000008, + CARD_STATE_GOOD = 0x00000010, + CARD_STATE_SHUTTING_DOWN = 0x00000020, + CARD_STATE_FAULT = 0x00000040, + CARD_STATE_RD_ONLY_FAULT = 0x00000080, + CARD_STATE_DSTROYING = 0x00000100, +}; + +enum rsxx_led { + LED_DEFAULT = 0x0, + LED_IDENTIFY = 0x1, + LED_SOAK = 0x2, +}; + +enum rsxx_creg_flash_lock { + CREG_FLASH_LOCK = 1, + CREG_FLASH_UNLOCK = 2, +}; + +enum rsxx_card_capabilities { + CARD_CAP_SUBPAGE_WRITES = 0x00000080, +}; + +enum rsxx_creg_stat { + CREG_STAT_STATUS_MASK = 0x00000003, + CREG_STAT_SUCCESS = 0x1, + CREG_STAT_ERROR = 0x2, + CREG_STAT_CHAR_PENDING = 0x00000004, /* Character I/O pending bit */ + CREG_STAT_LOG_PENDING = 0x00000008, /* HW log message pending bit */ + CREG_STAT_TAG_MASK = 0x0000ff00, +}; + +static inline unsigned int CREG_DATA(int N) +{ + return CREG_DATA0 + (N << 2); +} + +/*----------------- Convenient Log Wrappers -------------------*/ +#define CARD_TO_DEV(__CARD) (&(__CARD)->dev->dev) + +/***** config.c *****/ +int rsxx_load_config(struct rsxx_cardinfo *card); + +/***** core.c *****/ +void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr); +void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr); +void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card, + unsigned int intr); +void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card, + unsigned int intr); + +/***** dev.c *****/ +int rsxx_attach_dev(struct rsxx_cardinfo *card); +void rsxx_detach_dev(struct rsxx_cardinfo *card); +int rsxx_setup_dev(struct rsxx_cardinfo *card); +void rsxx_destroy_dev(struct rsxx_cardinfo *card); +int rsxx_dev_init(void); +void rsxx_dev_cleanup(void); + +/***** dma.c ****/ +typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card, + void *cb_data, + unsigned int status); +int rsxx_dma_setup(struct rsxx_cardinfo *card); +void rsxx_dma_destroy(struct rsxx_cardinfo *card); +int rsxx_dma_init(void); +void rsxx_dma_cleanup(void); +int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, + struct bio *bio, + atomic_t *n_dmas, + rsxx_dma_cb cb, + void *cb_data); + +/***** cregs.c *****/ +int rsxx_creg_write(struct rsxx_cardinfo *card, u32 addr, + unsigned int size8, + void *data, + int byte_stream); +int rsxx_creg_read(struct rsxx_cardinfo *card, + u32 addr, + unsigned int size8, + void *data, + int byte_stream); +int rsxx_read_hw_log(struct rsxx_cardinfo *card); +int rsxx_get_card_state(struct rsxx_cardinfo *card, + unsigned int *state); +int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8); +int rsxx_get_num_targets(struct rsxx_cardinfo *card, + unsigned int *n_targets); +int rsxx_get_card_capabilities(struct rsxx_cardinfo *card, + u32 *capabilities); +int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd); +int rsxx_creg_setup(struct rsxx_cardinfo *card); +void rsxx_creg_destroy(struct rsxx_cardinfo *card); +int rsxx_creg_init(void); +void rsxx_creg_cleanup(void); + +int rsxx_reg_access(struct rsxx_cardinfo *card, + struct rsxx_reg_access __user *ucmd, + int read); + + + +#endif /* __DRIVERS_BLOCK_RSXX_H__ */ diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 765fa2b..8766a22 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -844,6 +844,7 @@ static int swim_floppy_init(struct swim_priv *swd) swd->unit[drive].swd = swd; } + spin_lock_init(&swd->lock); swd->queue = blk_init_queue(do_fd_request, &swd->lock); if (!swd->queue) { err = -ENOMEM; diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 57763c5..758f2ac 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1090,10 +1090,13 @@ static const struct block_device_operations floppy_fops = { static void swim3_mb_event(struct macio_dev* mdev, int mb_state) { struct floppy_state *fs = macio_get_drvdata(mdev); - struct swim3 __iomem *sw = fs->swim3; + struct swim3 __iomem *sw; if (!fs) return; + + sw = fs->swim3; + if (mb_state != MB_FD) return; diff --git a/drivers/block/xd.c b/drivers/block/xd.c deleted file mode 100644 index ff54052..0000000 --- a/drivers/block/xd.c +++ /dev/null @@ -1,1123 +0,0 @@ -/* - * This file contains the driver for an XT hard disk controller - * (at least the DTC 5150X) for Linux. - * - * Author: Pat Mackinlay, pat@it.com.au - * Date: 29/09/92 - * - * Revised: 01/01/93, ... - * - * Ref: DTC 5150X Controller Specification (thanks to Kevin Fowler, - * kevinf@agora.rain.com) - * Also thanks to: Salvador Abreu, Dave Thaler, Risto Kankkunen and - * Wim Van Dorst. - * - * Revised: 04/04/94 by Risto Kankkunen - * Moved the detection code from xd_init() to xd_geninit() as it needed - * interrupts enabled and Linus didn't want to enable them in that first - * phase. xd_geninit() is the place to do these kinds of things anyway, - * he says. - * - * Modularized: 04/10/96 by Todd Fries, tfries@umr.edu - * - * Revised: 13/12/97 by Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl - * Fixed some problems with disk initialization and module initiation. - * Added support for manual geometry setting (except Seagate controllers) - * in form: - * xd_geo=<cyl_xda>,<head_xda>,<sec_xda>[,<cyl_xdb>,<head_xdb>,<sec_xdb>] - * Recovered DMA access. Abridged messages. Added support for DTC5051CX, - * WD1002-27X & XEBEC controllers. Driver uses now some jumper settings. - * Extended ioctl() support. - * - * Bugfix: 15/02/01, Paul G. - inform queue layer of tiny xd_maxsect. - * - */ - -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <linux/genhd.h> -#include <linux/hdreg.h> -#include <linux/ioport.h> -#include <linux/init.h> -#include <linux/wait.h> -#include <linux/blkdev.h> -#include <linux/mutex.h> -#include <linux/blkpg.h> -#include <linux/delay.h> -#include <linux/io.h> -#include <linux/gfp.h> - -#include <asm/uaccess.h> -#include <asm/dma.h> - -#include "xd.h" - -static DEFINE_MUTEX(xd_mutex); -static void __init do_xd_setup (int *integers); -#ifdef MODULE -static int xd[5] = { -1,-1,-1,-1, }; -#endif - -#define XD_DONT_USE_DMA 0 /* Initial value. may be overriden using - "nodma" module option */ -#define XD_INIT_DISK_DELAY (30) /* 30 ms delay during disk initialization */ - -/* Above may need to be increased if a problem with the 2nd drive detection - (ST11M controller) or resetting a controller (WD) appears */ - -static XD_INFO xd_info[XD_MAXDRIVES]; - -/* If you try this driver and find that your card is not detected by the driver at bootup, you need to add your BIOS - signature and details to the following list of signatures. A BIOS signature is a string embedded into the first - few bytes of your controller's on-board ROM BIOS. To find out what yours is, use something like MS-DOS's DEBUG - command. Run DEBUG, and then you can examine your BIOS signature with: - - d xxxx:0000 - - where xxxx is the segment of your controller (like C800 or D000 or something). On the ASCII dump at the right, you should - be able to see a string mentioning the manufacturer's copyright etc. Add this string into the table below. The parameters - in the table are, in order: - - offset ; this is the offset (in bytes) from the start of your ROM where the signature starts - signature ; this is the actual text of the signature - xd_?_init_controller ; this is the controller init routine used by your controller - xd_?_init_drive ; this is the drive init routine used by your controller - - The controllers directly supported at the moment are: DTC 5150x, WD 1004A27X, ST11M/R and override. If your controller is - made by the same manufacturer as one of these, try using the same init routines as they do. If that doesn't work, your - best bet is to use the "override" routines. These routines use a "portable" method of getting the disk's geometry, and - may work with your card. If none of these seem to work, try sending me some email and I'll see what I can do <grin>. - - NOTE: You can now specify your XT controller's parameters from the command line in the form xd=TYPE,IRQ,IO,DMA. The driver - should be able to detect your drive's geometry from this info. (eg: xd=0,5,0x320,3 is the "standard"). */ - -#include <asm/page.h> -#define xd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL,get_order(size)) -#define xd_dma_mem_free(addr, size) free_pages(addr, get_order(size)) -static char *xd_dma_buffer; - -static XD_SIGNATURE xd_sigs[] __initdata = { - { 0x0000,"Override geometry handler",NULL,xd_override_init_drive,"n unknown" }, /* Pat Mackinlay, pat@it.com.au */ - { 0x0008,"[BXD06 (C) DTC 17-MAY-1985]",xd_dtc_init_controller,xd_dtc5150cx_init_drive," DTC 5150CX" }, /* Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl */ - { 0x000B,"CRD18A Not an IBM rom. (C) Copyright Data Technology Corp. 05/31/88",xd_dtc_init_controller,xd_dtc_init_drive," DTC 5150X" }, /* Todd Fries, tfries@umr.edu */ - { 0x000B,"CXD23A Not an IBM ROM (C)Copyright Data Technology Corp 12/03/88",xd_dtc_init_controller,xd_dtc_init_drive," DTC 5150X" }, /* Pat Mackinlay, pat@it.com.au */ - { 0x0008,"07/15/86(C) Copyright 1986 Western Digital Corp.",xd_wd_init_controller,xd_wd_init_drive," Western Dig. 1002-27X" }, /* Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl */ - { 0x0008,"06/24/88(C) Copyright 1988 Western Digital Corp.",xd_wd_init_controller,xd_wd_init_drive," Western Dig. WDXT-GEN2" }, /* Dan Newcombe, newcombe@aa.csc.peachnet.edu */ - { 0x0015,"SEAGATE ST11 BIOS REVISION",xd_seagate_init_controller,xd_seagate_init_drive," Seagate ST11M/R" }, /* Salvador Abreu, spa@fct.unl.pt */ - { 0x0010,"ST11R BIOS",xd_seagate_init_controller,xd_seagate_init_drive," Seagate ST11M/R" }, /* Risto Kankkunen, risto.kankkunen@cs.helsinki.fi */ - { 0x0010,"ST11 BIOS v1.7",xd_seagate_init_controller,xd_seagate_init_drive," Seagate ST11R" }, /* Alan Hourihane, alanh@fairlite.demon.co.uk */ - { 0x1000,"(c)Copyright 1987 SMS",xd_omti_init_controller,xd_omti_init_drive,"n OMTI 5520" }, /* Dirk Melchers, dirk@merlin.nbg.sub.org */ - { 0x0006,"COPYRIGHT XEBEC (C) 1984",xd_xebec_init_controller,xd_xebec_init_drive," XEBEC" }, /* Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl */ - { 0x0008,"(C) Copyright 1984 Western Digital Corp", xd_wd_init_controller, xd_wd_init_drive," Western Dig. 1002s-wx2" }, - { 0x0008,"(C) Copyright 1986 Western Digital Corporation", xd_wd_init_controller, xd_wd_init_drive," 1986 Western Digital" }, /* jfree@sovereign.org */ -}; - -static unsigned int xd_bases[] __initdata = -{ - 0xC8000, 0xCA000, 0xCC000, - 0xCE000, 0xD0000, 0xD2000, - 0xD4000, 0xD6000, 0xD8000, - 0xDA000, 0xDC000, 0xDE000, - 0xE0000 -}; - -static DEFINE_SPINLOCK(xd_lock); - -static struct gendisk *xd_gendisk[2]; - -static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo); - -static const struct block_device_operations xd_fops = { - .owner = THIS_MODULE, - .ioctl = xd_ioctl, - .getgeo = xd_getgeo, -}; -static DECLARE_WAIT_QUEUE_HEAD(xd_wait_int); -static u_char xd_drives, xd_irq = 5, xd_dma = 3, xd_maxsectors; -static u_char xd_override __initdata = 0, xd_type __initdata = 0; -static u_short xd_iobase = 0x320; -static int xd_geo[XD_MAXDRIVES*3] __initdata = { 0, }; - -static volatile int xdc_busy; -static struct timer_list xd_watchdog_int; - -static volatile u_char xd_error; -static bool nodma = XD_DONT_USE_DMA; - -static struct request_queue *xd_queue; - -/* xd_init: register the block device number and set up pointer tables */ -static int __init xd_init(void) -{ - u_char i,controller; - unsigned int address; - int err; - -#ifdef MODULE - { - u_char count = 0; - for (i = 4; i > 0; i--) - if (((xd[i] = xd[i-1]) >= 0) && !count) - count = i; - if ((xd[0] = count)) - do_xd_setup(xd); - } -#endif - - init_timer (&xd_watchdog_int); xd_watchdog_int.function = xd_watchdog; - - err = -EBUSY; - if (register_blkdev(XT_DISK_MAJOR, "xd")) - goto out1; - - err = -ENOMEM; - xd_queue = blk_init_queue(do_xd_request, &xd_lock); - if (!xd_queue) - goto out1a; - - if (xd_detect(&controller,&address)) { - - printk("Detected a%s controller (type %d) at address %06x\n", - xd_sigs[controller].name,controller,address); - if (!request_region(xd_iobase,4,"xd")) { - printk("xd: Ports at 0x%x are not available\n", - xd_iobase); - goto out2; - } - if (controller) - xd_sigs[controller].init_controller(address); - xd_drives = xd_initdrives(xd_sigs[controller].init_drive); - - printk("Detected %d hard drive%s (using IRQ%d & DMA%d)\n", - xd_drives,xd_drives == 1 ? "" : "s",xd_irq,xd_dma); - } - - /* - * With the drive detected, xd_maxsectors should now be known. - * If xd_maxsectors is 0, nothing was detected and we fall through - * to return -ENODEV - */ - if (!xd_dma_buffer && xd_maxsectors) { - xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200); - if (!xd_dma_buffer) { - printk(KERN_ERR "xd: Out of memory.\n"); - goto out3; - } - } - - err = -ENODEV; - if (!xd_drives) - goto out3; - - for (i = 0; i < xd_drives; i++) { - XD_INFO *p = &xd_info[i]; - struct gendisk *disk = alloc_disk(64); - if (!disk) - goto Enomem; - p->unit = i; - disk->major = XT_DISK_MAJOR; - disk->first_minor = i<<6; - sprintf(disk->disk_name, "xd%c", i+'a'); - disk->fops = &xd_fops; - disk->private_data = p; - disk->queue = xd_queue; - set_capacity(disk, p->heads * p->cylinders * p->sectors); - printk(" %s: CHS=%d/%d/%d\n", disk->disk_name, - p->cylinders, p->heads, p->sectors); - xd_gendisk[i] = disk; - } - - err = -EBUSY; - if (request_irq(xd_irq,xd_interrupt_handler, 0, "XT hard disk", NULL)) { - printk("xd: unable to get IRQ%d\n",xd_irq); - goto out4; - } - - if (request_dma(xd_dma,"xd")) { - printk("xd: unable to get DMA%d\n",xd_dma); - goto out5; - } - - /* xd_maxsectors depends on controller - so set after detection */ - blk_queue_max_hw_sectors(xd_queue, xd_maxsectors); - - for (i = 0; i < xd_drives; i++) - add_disk(xd_gendisk[i]); - - return 0; - -out5: - free_irq(xd_irq, NULL); -out4: - for (i = 0; i < xd_drives; i++) - put_disk(xd_gendisk[i]); -out3: - if (xd_maxsectors) - release_region(xd_iobase,4); - - if (xd_dma_buffer) - xd_dma_mem_free((unsigned long)xd_dma_buffer, - xd_maxsectors * 0x200); -out2: - blk_cleanup_queue(xd_queue); -out1a: - unregister_blkdev(XT_DISK_MAJOR, "xd"); -out1: - return err; -Enomem: - err = -ENOMEM; - while (i--) - put_disk(xd_gendisk[i]); - goto out3; -} - -/* xd_detect: scan the possible BIOS ROM locations for the signature strings */ -static u_char __init xd_detect (u_char *controller, unsigned int *address) -{ - int i, j; - - if (xd_override) - { - *controller = xd_type; - *address = 0; - return(1); - } - - for (i = 0; i < ARRAY_SIZE(xd_bases); i++) { - void __iomem *p = ioremap(xd_bases[i], 0x2000); - if (!p) - continue; - for (j = 1; j < ARRAY_SIZE(xd_sigs); j++) { - const char *s = xd_sigs[j].string; - if (check_signature(p + xd_sigs[j].offset, s, strlen(s))) { - *controller = j; - xd_type = j; - *address = xd_bases[i]; - iounmap(p); - return 1; - } - } - iounmap(p); - } - return 0; -} - -/* do_xd_request: handle an incoming request */ -static void do_xd_request (struct request_queue * q) -{ - struct request *req; - - if (xdc_busy) - return; - - req = blk_fetch_request(q); - while (req) { - unsigned block = blk_rq_pos(req); - unsigned count = blk_rq_cur_sectors(req); - XD_INFO *disk = req->rq_disk->private_data; - int res = -EIO; - int retry; - - if (req->cmd_type != REQ_TYPE_FS) - goto done; - if (block + count > get_capacity(req->rq_disk)) - goto done; - for (retry = 0; (retry < XD_RETRIES) && !res; retry++) - res = xd_readwrite(rq_data_dir(req), disk, req->buffer, - block, count); - done: - /* wrap up, 0 = success, -errno = fail */ - if (!__blk_end_request_cur(req, res)) - req = blk_fetch_request(q); - } -} - -static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - XD_INFO *p = bdev->bd_disk->private_data; - - geo->heads = p->heads; - geo->sectors = p->sectors; - geo->cylinders = p->cylinders; - return 0; -} - -/* xd_ioctl: handle device ioctl's */ -static int xd_locked_ioctl(struct block_device *bdev, fmode_t mode, u_int cmd, u_long arg) -{ - switch (cmd) { - case HDIO_SET_DMA: - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (xdc_busy) return -EBUSY; - nodma = !arg; - if (nodma && xd_dma_buffer) { - xd_dma_mem_free((unsigned long)xd_dma_buffer, - xd_maxsectors * 0x200); - xd_dma_buffer = NULL; - } else if (!nodma && !xd_dma_buffer) { - xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200); - if (!xd_dma_buffer) { - nodma = XD_DONT_USE_DMA; - return -ENOMEM; - } - } - return 0; - case HDIO_GET_DMA: - return put_user(!nodma, (long __user *) arg); - case HDIO_GET_MULTCOUNT: - return put_user(xd_maxsectors, (long __user *) arg); - default: - return -EINVAL; - } -} - -static int xd_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long param) -{ - int ret; - - mutex_lock(&xd_mutex); - ret = xd_locked_ioctl(bdev, mode, cmd, param); - mutex_unlock(&xd_mutex); - - return ret; -} - -/* xd_readwrite: handle a read/write request */ -static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_int count) -{ - int drive = p->unit; - u_char cmdblk[6],sense[4]; - u_short track,cylinder; - u_char head,sector,control,mode = PIO_MODE,temp; - char **real_buffer; - register int i; - -#ifdef DEBUG_READWRITE - printk("xd_readwrite: operation = %s, drive = %d, buffer = 0x%X, block = %d, count = %d\n",operation == READ ? "read" : "write",drive,buffer,block,count); -#endif /* DEBUG_READWRITE */ - - spin_unlock_irq(&xd_lock); - - control = p->control; - if (!xd_dma_buffer) - xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200); - while (count) { - temp = count < xd_maxsectors ? count : xd_maxsectors; - - track = block / p->sectors; - head = track % p->heads; - cylinder = track / p->heads; - sector = block % p->sectors; - -#ifdef DEBUG_READWRITE - printk("xd_readwrite: drive = %d, head = %d, cylinder = %d, sector = %d, count = %d\n",drive,head,cylinder,sector,temp); -#endif /* DEBUG_READWRITE */ - - if (xd_dma_buffer) { - mode = xd_setup_dma(operation == READ ? DMA_MODE_READ : DMA_MODE_WRITE,(u_char *)(xd_dma_buffer),temp * 0x200); - real_buffer = &xd_dma_buffer; - for (i=0; i < (temp * 0x200); i++) - xd_dma_buffer[i] = buffer[i]; - } - else - real_buffer = &buffer; - - xd_build(cmdblk,operation == READ ? CMD_READ : CMD_WRITE,drive,head,cylinder,sector,temp & 0xFF,control); - - switch (xd_command(cmdblk,mode,(u_char *)(*real_buffer),(u_char *)(*real_buffer),sense,XD_TIMEOUT)) { - case 1: - printk("xd%c: %s timeout, recalibrating drive\n",'a'+drive,(operation == READ ? "read" : "write")); - xd_recalibrate(drive); - spin_lock_irq(&xd_lock); - return -EIO; - case 2: - if (sense[0] & 0x30) { - printk("xd%c: %s - ",'a'+drive,(operation == READ ? "reading" : "writing")); - switch ((sense[0] & 0x30) >> 4) { - case 0: printk("drive error, code = 0x%X",sense[0] & 0x0F); - break; - case 1: printk("controller error, code = 0x%X",sense[0] & 0x0F); - break; - case 2: printk("command error, code = 0x%X",sense[0] & 0x0F); - break; - case 3: printk("miscellaneous error, code = 0x%X",sense[0] & 0x0F); - break; - } - } - if (sense[0] & 0x80) - printk(" - CHS = %d/%d/%d\n",((sense[2] & 0xC0) << 2) | sense[3],sense[1] & 0x1F,sense[2] & 0x3F); - /* reported drive number = (sense[1] & 0xE0) >> 5 */ - else - printk(" - no valid disk address\n"); - spin_lock_irq(&xd_lock); - return -EIO; - } - if (xd_dma_buffer) - for (i=0; i < (temp * 0x200); i++) - buffer[i] = xd_dma_buffer[i]; - - count -= temp, buffer += temp * 0x200, block += temp; - } - spin_lock_irq(&xd_lock); - return 0; -} - -/* xd_recalibrate: recalibrate a given drive and reset controller if necessary */ -static void xd_recalibrate (u_char drive) -{ - u_char cmdblk[6]; - - xd_build(cmdblk,CMD_RECALIBRATE,drive,0,0,0,0,0); - if (xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT * 8)) - printk("xd%c: warning! error recalibrating, controller may be unstable\n", 'a'+drive); -} - -/* xd_interrupt_handler: interrupt service routine */ -static irqreturn_t xd_interrupt_handler(int irq, void *dev_id) -{ - if (inb(XD_STATUS) & STAT_INTERRUPT) { /* check if it was our device */ -#ifdef DEBUG_OTHER - printk("xd_interrupt_handler: interrupt detected\n"); -#endif /* DEBUG_OTHER */ - outb(0,XD_CONTROL); /* acknowledge interrupt */ - wake_up(&xd_wait_int); /* and wake up sleeping processes */ - return IRQ_HANDLED; - } - else - printk("xd: unexpected interrupt\n"); - return IRQ_NONE; -} - -/* xd_setup_dma: set up the DMA controller for a data transfer */ -static u_char xd_setup_dma (u_char mode,u_char *buffer,u_int count) -{ - unsigned long f; - - if (nodma) - return (PIO_MODE); - if (((unsigned long) buffer & 0xFFFF0000) != (((unsigned long) buffer + count) & 0xFFFF0000)) { -#ifdef DEBUG_OTHER - printk("xd_setup_dma: using PIO, transfer overlaps 64k boundary\n"); -#endif /* DEBUG_OTHER */ - return (PIO_MODE); - } - - f=claim_dma_lock(); - disable_dma(xd_dma); - clear_dma_ff(xd_dma); - set_dma_mode(xd_dma,mode); - set_dma_addr(xd_dma, (unsigned long) buffer); - set_dma_count(xd_dma,count); - - release_dma_lock(f); - - return (DMA_MODE); /* use DMA and INT */ -} - -/* xd_build: put stuff into an array in a format suitable for the controller */ -static u_char *xd_build (u_char *cmdblk,u_char command,u_char drive,u_char head,u_short cylinder,u_char sector,u_char count,u_char control) -{ - cmdblk[0] = command; - cmdblk[1] = ((drive & 0x07) << 5) | (head & 0x1F); - cmdblk[2] = ((cylinder & 0x300) >> 2) | (sector & 0x3F); - cmdblk[3] = cylinder & 0xFF; - cmdblk[4] = count; - cmdblk[5] = control; - - return (cmdblk); -} - -static void xd_watchdog (unsigned long unused) -{ - xd_error = 1; - wake_up(&xd_wait_int); -} - -/* xd_waitport: waits until port & mask == flags or a timeout occurs. return 1 for a timeout */ -static inline u_char xd_waitport (u_short port,u_char flags,u_char mask,u_long timeout) -{ - u_long expiry = jiffies + timeout; - int success; - - xdc_busy = 1; - while ((success = ((inb(port) & mask) != flags)) && time_before(jiffies, expiry)) - schedule_timeout_uninterruptible(1); - xdc_busy = 0; - return (success); -} - -static inline u_int xd_wait_for_IRQ (void) -{ - unsigned long flags; - xd_watchdog_int.expires = jiffies + 8 * HZ; - add_timer(&xd_watchdog_int); - - flags=claim_dma_lock(); - enable_dma(xd_dma); - release_dma_lock(flags); - - sleep_on(&xd_wait_int); - del_timer(&xd_watchdog_int); - xdc_busy = 0; - - flags=claim_dma_lock(); - disable_dma(xd_dma); - release_dma_lock(flags); - - if (xd_error) { - printk("xd: missed IRQ - command aborted\n"); - xd_error = 0; - return (1); - } - return (0); -} - -/* xd_command: handle all data transfers necessary for a single command */ -static u_int xd_command (u_char *command,u_char mode,u_char *indata,u_char *outdata,u_char *sense,u_long timeout) -{ - u_char cmdblk[6],csb,complete = 0; - -#ifdef DEBUG_COMMAND - printk("xd_command: command = 0x%X, mode = 0x%X, indata = 0x%X, outdata = 0x%X, sense = 0x%X\n",command,mode,indata,outdata,sense); -#endif /* DEBUG_COMMAND */ - - outb(0,XD_SELECT); - outb(mode,XD_CONTROL); - - if (xd_waitport(XD_STATUS,STAT_SELECT,STAT_SELECT,timeout)) - return (1); - - while (!complete) { - if (xd_waitport(XD_STATUS,STAT_READY,STAT_READY,timeout)) - return (1); - - switch (inb(XD_STATUS) & (STAT_COMMAND | STAT_INPUT)) { - case 0: - if (mode == DMA_MODE) { - if (xd_wait_for_IRQ()) - return (1); - } else - outb(outdata ? *outdata++ : 0,XD_DATA); - break; - case STAT_INPUT: - if (mode == DMA_MODE) { - if (xd_wait_for_IRQ()) - return (1); - } else - if (indata) - *indata++ = inb(XD_DATA); - else - inb(XD_DATA); - break; - case STAT_COMMAND: - outb(command ? *command++ : 0,XD_DATA); - break; - case STAT_COMMAND | STAT_INPUT: - complete = 1; - break; - } - } - csb = inb(XD_DATA); - - if (xd_waitport(XD_STATUS,0,STAT_SELECT,timeout)) /* wait until deselected */ - return (1); - - if (csb & CSB_ERROR) { /* read sense data if error */ - xd_build(cmdblk,CMD_SENSE,(csb & CSB_LUN) >> 5,0,0,0,0,0); - if (xd_command(cmdblk,0,sense,NULL,NULL,XD_TIMEOUT)) - printk("xd: warning! sense command failed!\n"); - } - -#ifdef DEBUG_COMMAND - printk("xd_command: completed with csb = 0x%X\n",csb); -#endif /* DEBUG_COMMAND */ - - return (csb & CSB_ERROR); -} - -static u_char __init xd_initdrives (void (*init_drive)(u_char drive)) -{ - u_char cmdblk[6],i,count = 0; - - for (i = 0; i < XD_MAXDRIVES; i++) { - xd_build(cmdblk,CMD_TESTREADY,i,0,0,0,0,0); - if (!xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT*8)) { - msleep_interruptible(XD_INIT_DISK_DELAY); - - init_drive(count); - count++; - - msleep_interruptible(XD_INIT_DISK_DELAY); - } - } - return (count); -} - -static void __init xd_manual_geo_set (u_char drive) -{ - xd_info[drive].heads = (u_char)(xd_geo[3 * drive + 1]); - xd_info[drive].cylinders = (u_short)(xd_geo[3 * drive]); - xd_info[drive].sectors = (u_char)(xd_geo[3 * drive + 2]); -} - -static void __init xd_dtc_init_controller (unsigned int address) -{ - switch (address) { - case 0x00000: - case 0xC8000: break; /*initial: 0x320 */ - case 0xCA000: xd_iobase = 0x324; - case 0xD0000: /*5150CX*/ - case 0xD8000: break; /*5150CX & 5150XL*/ - default: printk("xd_dtc_init_controller: unsupported BIOS address %06x\n",address); - break; - } - xd_maxsectors = 0x01; /* my card seems to have trouble doing multi-block transfers? */ - - outb(0,XD_RESET); /* reset the controller */ -} - - -static void __init xd_dtc5150cx_init_drive (u_char drive) -{ - /* values from controller's BIOS - BIOS chip may be removed */ - static u_short geometry_table[][4] = { - {0x200,8,0x200,0x100}, - {0x267,2,0x267,0x267}, - {0x264,4,0x264,0x80}, - {0x132,4,0x132,0x0}, - {0x132,2,0x80, 0x132}, - {0x177,8,0x177,0x0}, - {0x132,8,0x84, 0x0}, - {}, /* not used */ - {0x132,6,0x80, 0x100}, - {0x200,6,0x100,0x100}, - {0x264,2,0x264,0x80}, - {0x280,4,0x280,0x100}, - {0x2B9,3,0x2B9,0x2B9}, - {0x2B9,5,0x2B9,0x2B9}, - {0x280,6,0x280,0x100}, - {0x132,4,0x132,0x0}}; - u_char n; - - n = inb(XD_JUMPER); - n = (drive ? n : (n >> 2)) & 0x33; - n = (n | (n >> 2)) & 0x0F; - if (xd_geo[3*drive]) - xd_manual_geo_set(drive); - else - if (n != 7) { - xd_info[drive].heads = (u_char)(geometry_table[n][1]); /* heads */ - xd_info[drive].cylinders = geometry_table[n][0]; /* cylinders */ - xd_info[drive].sectors = 17; /* sectors */ -#if 0 - xd_info[drive].rwrite = geometry_table[n][2]; /* reduced write */ - xd_info[drive].precomp = geometry_table[n][3] /* write precomp */ - xd_info[drive].ecc = 0x0B; /* ecc length */ -#endif /* 0 */ - } - else { - printk("xd%c: undetermined drive geometry\n",'a'+drive); - return; - } - xd_info[drive].control = 5; /* control byte */ - xd_setparam(CMD_DTCSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,geometry_table[n][2],geometry_table[n][3],0x0B); - xd_recalibrate(drive); -} - -static void __init xd_dtc_init_drive (u_char drive) -{ - u_char cmdblk[6],buf[64]; - - xd_build(cmdblk,CMD_DTCGETGEOM,drive,0,0,0,0,0); - if (!xd_command(cmdblk,PIO_MODE,buf,NULL,NULL,XD_TIMEOUT * 2)) { - xd_info[drive].heads = buf[0x0A]; /* heads */ - xd_info[drive].cylinders = ((u_short *) (buf))[0x04]; /* cylinders */ - xd_info[drive].sectors = 17; /* sectors */ - if (xd_geo[3*drive]) - xd_manual_geo_set(drive); -#if 0 - xd_info[drive].rwrite = ((u_short *) (buf + 1))[0x05]; /* reduced write */ - xd_info[drive].precomp = ((u_short *) (buf + 1))[0x06]; /* write precomp */ - xd_info[drive].ecc = buf[0x0F]; /* ecc length */ -#endif /* 0 */ - xd_info[drive].control = 0; /* control byte */ - - xd_setparam(CMD_DTCSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,((u_short *) (buf + 1))[0x05],((u_short *) (buf + 1))[0x06],buf[0x0F]); - xd_build(cmdblk,CMD_DTCSETSTEP,drive,0,0,0,0,7); - if (xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT * 2)) - printk("xd_dtc_init_drive: error setting step rate for xd%c\n", 'a'+drive); - } - else - printk("xd_dtc_init_drive: error reading geometry for xd%c\n", 'a'+drive); -} - -static void __init xd_wd_init_controller (unsigned int address) -{ - switch (address) { - case 0x00000: - case 0xC8000: break; /*initial: 0x320 */ - case 0xCA000: xd_iobase = 0x324; break; - case 0xCC000: xd_iobase = 0x328; break; - case 0xCE000: xd_iobase = 0x32C; break; - case 0xD0000: xd_iobase = 0x328; break; /* ? */ - case 0xD8000: xd_iobase = 0x32C; break; /* ? */ - default: printk("xd_wd_init_controller: unsupported BIOS address %06x\n",address); - break; - } - xd_maxsectors = 0x01; /* this one doesn't wrap properly either... */ - - outb(0,XD_RESET); /* reset the controller */ - - msleep(XD_INIT_DISK_DELAY); -} - -static void __init xd_wd_init_drive (u_char drive) -{ - /* values from controller's BIOS - BIOS may be disabled */ - static u_short geometry_table[][4] = { - {0x264,4,0x1C2,0x1C2}, /* common part */ - {0x132,4,0x099,0x0}, - {0x267,2,0x1C2,0x1C2}, - {0x267,4,0x1C2,0x1C2}, - - {0x334,6,0x335,0x335}, /* 1004 series RLL */ - {0x30E,4,0x30F,0x3DC}, - {0x30E,2,0x30F,0x30F}, - {0x267,4,0x268,0x268}, - - {0x3D5,5,0x3D6,0x3D6}, /* 1002 series RLL */ - {0x3DB,7,0x3DC,0x3DC}, - {0x264,4,0x265,0x265}, - {0x267,4,0x268,0x268}}; - - u_char cmdblk[6],buf[0x200]; - u_char n = 0,rll,jumper_state,use_jumper_geo; - u_char wd_1002 = (xd_sigs[xd_type].string[7] == '6'); - - jumper_state = ~(inb(0x322)); - if (jumper_state & 0x40) - xd_irq = 9; - rll = (jumper_state & 0x30) ? (0x04 << wd_1002) : 0; - xd_build(cmdblk,CMD_READ,drive,0,0,0,1,0); - if (!xd_command(cmdblk,PIO_MODE,buf,NULL,NULL,XD_TIMEOUT * 2)) { - xd_info[drive].heads = buf[0x1AF]; /* heads */ - xd_info[drive].cylinders = ((u_short *) (buf + 1))[0xD6]; /* cylinders */ - xd_info[drive].sectors = 17; /* sectors */ - if (xd_geo[3*drive]) - xd_manual_geo_set(drive); -#if 0 - xd_info[drive].rwrite = ((u_short *) (buf))[0xD8]; /* reduced write */ - xd_info[drive].wprecomp = ((u_short *) (buf))[0xDA]; /* write precomp */ - xd_info[drive].ecc = buf[0x1B4]; /* ecc length */ -#endif /* 0 */ - xd_info[drive].control = buf[0x1B5]; /* control byte */ - use_jumper_geo = !(xd_info[drive].heads) || !(xd_info[drive].cylinders); - if (xd_geo[3*drive]) { - xd_manual_geo_set(drive); - xd_info[drive].control = rll ? 7 : 5; - } - else if (use_jumper_geo) { - n = (((jumper_state & 0x0F) >> (drive << 1)) & 0x03) | rll; - xd_info[drive].cylinders = geometry_table[n][0]; - xd_info[drive].heads = (u_char)(geometry_table[n][1]); - xd_info[drive].control = rll ? 7 : 5; -#if 0 - xd_info[drive].rwrite = geometry_table[n][2]; - xd_info[drive].wprecomp = geometry_table[n][3]; - xd_info[drive].ecc = 0x0B; -#endif /* 0 */ - } - if (!wd_1002) { - if (use_jumper_geo) - xd_setparam(CMD_WDSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders, - geometry_table[n][2],geometry_table[n][3],0x0B); - else - xd_setparam(CMD_WDSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders, - ((u_short *) (buf))[0xD8],((u_short *) (buf))[0xDA],buf[0x1B4]); - } - /* 1002 based RLL controller requests converted addressing, but reports physical - (physical 26 sec., logical 17 sec.) - 1004 based ???? */ - if (rll & wd_1002) { - if ((xd_info[drive].cylinders *= 26, - xd_info[drive].cylinders /= 17) > 1023) - xd_info[drive].cylinders = 1023; /* 1024 ? */ -#if 0 - xd_info[drive].rwrite *= 26; - xd_info[drive].rwrite /= 17; - xd_info[drive].wprecomp *= 26 - xd_info[drive].wprecomp /= 17; -#endif /* 0 */ - } - } - else - printk("xd_wd_init_drive: error reading geometry for xd%c\n",'a'+drive); - -} - -static void __init xd_seagate_init_controller (unsigned int address) -{ - switch (address) { - case 0x00000: - case 0xC8000: break; /*initial: 0x320 */ - case 0xD0000: xd_iobase = 0x324; break; - case 0xD8000: xd_iobase = 0x328; break; - case 0xE0000: xd_iobase = 0x32C; break; - default: printk("xd_seagate_init_controller: unsupported BIOS address %06x\n",address); - break; - } - xd_maxsectors = 0x40; - - outb(0,XD_RESET); /* reset the controller */ -} - -static void __init xd_seagate_init_drive (u_char drive) -{ - u_char cmdblk[6],buf[0x200]; - - xd_build(cmdblk,CMD_ST11GETGEOM,drive,0,0,0,1,0); - if (!xd_command(cmdblk,PIO_MODE,buf,NULL,NULL,XD_TIMEOUT * 2)) { - xd_info[drive].heads = buf[0x04]; /* heads */ - xd_info[drive].cylinders = (buf[0x02] << 8) | buf[0x03]; /* cylinders */ - xd_info[drive].sectors = buf[0x05]; /* sectors */ - xd_info[drive].control = 0; /* control byte */ - } - else - printk("xd_seagate_init_drive: error reading geometry from xd%c\n", 'a'+drive); -} - -/* Omti support courtesy Dirk Melchers */ -static void __init xd_omti_init_controller (unsigned int address) -{ - switch (address) { - case 0x00000: - case 0xC8000: break; /*initial: 0x320 */ - case 0xD0000: xd_iobase = 0x324; break; - case 0xD8000: xd_iobase = 0x328; break; - case 0xE0000: xd_iobase = 0x32C; break; - default: printk("xd_omti_init_controller: unsupported BIOS address %06x\n",address); - break; - } - - xd_maxsectors = 0x40; - - outb(0,XD_RESET); /* reset the controller */ -} - -static void __init xd_omti_init_drive (u_char drive) -{ - /* gets infos from drive */ - xd_override_init_drive(drive); - - /* set other parameters, Hardcoded, not that nice :-) */ - xd_info[drive].control = 2; -} - -/* Xebec support (AK) */ -static void __init xd_xebec_init_controller (unsigned int address) -{ -/* iobase may be set manually in range 0x300 - 0x33C - irq may be set manually to 2(9),3,4,5,6,7 - dma may be set manually to 1,2,3 - (How to detect them ???) -BIOS address may be set manually in range 0x0 - 0xF8000 -If you need non-standard settings use the xd=... command */ - - switch (address) { - case 0x00000: - case 0xC8000: /* initially: xd_iobase==0x320 */ - case 0xD0000: - case 0xD2000: - case 0xD4000: - case 0xD6000: - case 0xD8000: - case 0xDA000: - case 0xDC000: - case 0xDE000: - case 0xE0000: break; - default: printk("xd_xebec_init_controller: unsupported BIOS address %06x\n",address); - break; - } - - xd_maxsectors = 0x01; - outb(0,XD_RESET); /* reset the controller */ - - msleep(XD_INIT_DISK_DELAY); -} - -static void __init xd_xebec_init_drive (u_char drive) -{ - /* values from controller's BIOS - BIOS chip may be removed */ - static u_short geometry_table[][5] = { - {0x132,4,0x080,0x080,0x7}, - {0x132,4,0x080,0x080,0x17}, - {0x264,2,0x100,0x100,0x7}, - {0x264,2,0x100,0x100,0x17}, - {0x132,8,0x080,0x080,0x7}, - {0x132,8,0x080,0x080,0x17}, - {0x264,4,0x100,0x100,0x6}, - {0x264,4,0x100,0x100,0x17}, - {0x2BC,5,0x2BC,0x12C,0x6}, - {0x3A5,4,0x3A5,0x3A5,0x7}, - {0x26C,6,0x26C,0x26C,0x7}, - {0x200,8,0x200,0x100,0x17}, - {0x400,5,0x400,0x400,0x7}, - {0x400,6,0x400,0x400,0x7}, - {0x264,8,0x264,0x200,0x17}, - {0x33E,7,0x33E,0x200,0x7}}; - u_char n; - - n = inb(XD_JUMPER) & 0x0F; /* BIOS's drive number: same geometry - is assumed for BOTH drives */ - if (xd_geo[3*drive]) - xd_manual_geo_set(drive); - else { - xd_info[drive].heads = (u_char)(geometry_table[n][1]); /* heads */ - xd_info[drive].cylinders = geometry_table[n][0]; /* cylinders */ - xd_info[drive].sectors = 17; /* sectors */ -#if 0 - xd_info[drive].rwrite = geometry_table[n][2]; /* reduced write */ - xd_info[drive].precomp = geometry_table[n][3] /* write precomp */ - xd_info[drive].ecc = 0x0B; /* ecc length */ -#endif /* 0 */ - } - xd_info[drive].control = geometry_table[n][4]; /* control byte */ - xd_setparam(CMD_XBSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,geometry_table[n][2],geometry_table[n][3],0x0B); - xd_recalibrate(drive); -} - -/* xd_override_init_drive: this finds disk geometry in a "binary search" style, narrowing in on the "correct" number of heads - etc. by trying values until it gets the highest successful value. Idea courtesy Salvador Abreu (spa@fct.unl.pt). */ -static void __init xd_override_init_drive (u_char drive) -{ - u_short min[] = { 0,0,0 },max[] = { 16,1024,64 },test[] = { 0,0,0 }; - u_char cmdblk[6],i; - - if (xd_geo[3*drive]) - xd_manual_geo_set(drive); - else { - for (i = 0; i < 3; i++) { - while (min[i] != max[i] - 1) { - test[i] = (min[i] + max[i]) / 2; - xd_build(cmdblk,CMD_SEEK,drive,(u_char) test[0],(u_short) test[1],(u_char) test[2],0,0); - if (!xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT * 2)) - min[i] = test[i]; - else - max[i] = test[i]; - } - test[i] = min[i]; - } - xd_info[drive].heads = (u_char) min[0] + 1; - xd_info[drive].cylinders = (u_short) min[1] + 1; - xd_info[drive].sectors = (u_char) min[2] + 1; - } - xd_info[drive].control = 0; -} - -/* xd_setup: initialise controller from command line parameters */ -static void __init do_xd_setup (int *integers) -{ - switch (integers[0]) { - case 4: if (integers[4] < 0) - nodma = 1; - else if (integers[4] < 8) - xd_dma = integers[4]; - case 3: if ((integers[3] > 0) && (integers[3] <= 0x3FC)) - xd_iobase = integers[3]; - case 2: if ((integers[2] > 0) && (integers[2] < 16)) - xd_irq = integers[2]; - case 1: xd_override = 1; - if ((integers[1] >= 0) && (integers[1] < ARRAY_SIZE(xd_sigs))) - xd_type = integers[1]; - case 0: break; - default:printk("xd: too many parameters for xd\n"); - } - xd_maxsectors = 0x01; -} - -/* xd_setparam: set the drive characteristics */ -static void __init xd_setparam (u_char command,u_char drive,u_char heads,u_short cylinders,u_short rwrite,u_short wprecomp,u_char ecc) -{ - u_char cmdblk[14]; - - xd_build(cmdblk,command,drive,0,0,0,0,0); - cmdblk[6] = (u_char) (cylinders >> 8) & 0x03; - cmdblk[7] = (u_char) (cylinders & 0xFF); - cmdblk[8] = heads & 0x1F; - cmdblk[9] = (u_char) (rwrite >> 8) & 0x03; - cmdblk[10] = (u_char) (rwrite & 0xFF); - cmdblk[11] = (u_char) (wprecomp >> 8) & 0x03; - cmdblk[12] = (u_char) (wprecomp & 0xFF); - cmdblk[13] = ecc; - - /* Some controllers require geometry info as data, not command */ - - if (xd_command(cmdblk,PIO_MODE,NULL,&cmdblk[6],NULL,XD_TIMEOUT * 2)) - printk("xd: error setting characteristics for xd%c\n", 'a'+drive); -} - - -#ifdef MODULE - -module_param_array(xd, int, NULL, 0); -module_param_array(xd_geo, int, NULL, 0); -module_param(nodma, bool, 0); - -MODULE_LICENSE("GPL"); - -void cleanup_module(void) -{ - int i; - unregister_blkdev(XT_DISK_MAJOR, "xd"); - for (i = 0; i < xd_drives; i++) { - del_gendisk(xd_gendisk[i]); - put_disk(xd_gendisk[i]); - } - blk_cleanup_queue(xd_queue); - release_region(xd_iobase,4); - if (xd_drives) { - free_irq(xd_irq, NULL); - free_dma(xd_dma); - if (xd_dma_buffer) - xd_dma_mem_free((unsigned long)xd_dma_buffer, xd_maxsectors * 0x200); - } -} -#else - -static int __init xd_setup (char *str) -{ - int ints[5]; - get_options (str, ARRAY_SIZE (ints), ints); - do_xd_setup (ints); - return 1; -} - -/* xd_manual_geo_init: initialise drive geometry from command line parameters - (used only for WD drives) */ -static int __init xd_manual_geo_init (char *str) -{ - int i, integers[1 + 3*XD_MAXDRIVES]; - - get_options (str, ARRAY_SIZE (integers), integers); - if (integers[0]%3 != 0) { - printk("xd: incorrect number of parameters for xd_geo\n"); - return 1; - } - for (i = 0; (i < integers[0]) && (i < 3*XD_MAXDRIVES); i++) - xd_geo[i] = integers[i+1]; - return 1; -} - -__setup ("xd=", xd_setup); -__setup ("xd_geo=", xd_manual_geo_init); - -#endif /* MODULE */ - -module_init(xd_init); -MODULE_ALIAS_BLOCKDEV_MAJOR(XT_DISK_MAJOR); diff --git a/drivers/block/xd.h b/drivers/block/xd.h deleted file mode 100644 index 37cacef..0000000 --- a/drivers/block/xd.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef _LINUX_XD_H -#define _LINUX_XD_H - -/* - * This file contains the definitions for the IO ports and errors etc. for XT hard disk controllers (at least the DTC 5150X). - * - * Author: Pat Mackinlay, pat@it.com.au - * Date: 29/09/92 - * - * Revised: 01/01/93, ... - * - * Ref: DTC 5150X Controller Specification (thanks to Kevin Fowler, kevinf@agora.rain.com) - * Also thanks to: Salvador Abreu, Dave Thaler, Risto Kankkunen and Wim Van Dorst. - */ - -#include <linux/interrupt.h> - -/* XT hard disk controller registers */ -#define XD_DATA (xd_iobase + 0x00) /* data RW register */ -#define XD_RESET (xd_iobase + 0x01) /* reset WO register */ -#define XD_STATUS (xd_iobase + 0x01) /* status RO register */ -#define XD_SELECT (xd_iobase + 0x02) /* select WO register */ -#define XD_JUMPER (xd_iobase + 0x02) /* jumper RO register */ -#define XD_CONTROL (xd_iobase + 0x03) /* DMAE/INTE WO register */ -#define XD_RESERVED (xd_iobase + 0x03) /* reserved */ - -/* XT hard disk controller commands (incomplete list) */ -#define CMD_TESTREADY 0x00 /* test drive ready */ -#define CMD_RECALIBRATE 0x01 /* recalibrate drive */ -#define CMD_SENSE 0x03 /* request sense */ -#define CMD_FORMATDRV 0x04 /* format drive */ -#define CMD_VERIFY 0x05 /* read verify */ -#define CMD_FORMATTRK 0x06 /* format track */ -#define CMD_FORMATBAD 0x07 /* format bad track */ -#define CMD_READ 0x08 /* read */ -#define CMD_WRITE 0x0A /* write */ -#define CMD_SEEK 0x0B /* seek */ - -/* Controller specific commands */ -#define CMD_DTCSETPARAM 0x0C /* set drive parameters (DTC 5150X & CX only?) */ -#define CMD_DTCGETECC 0x0D /* get ecc error length (DTC 5150X only?) */ -#define CMD_DTCREADBUF 0x0E /* read sector buffer (DTC 5150X only?) */ -#define CMD_DTCWRITEBUF 0x0F /* write sector buffer (DTC 5150X only?) */ -#define CMD_DTCREMAPTRK 0x11 /* assign alternate track (DTC 5150X only?) */ -#define CMD_DTCGETPARAM 0xFB /* get drive parameters (DTC 5150X only?) */ -#define CMD_DTCSETSTEP 0xFC /* set step rate (DTC 5150X only?) */ -#define CMD_DTCSETGEOM 0xFE /* set geometry data (DTC 5150X only?) */ -#define CMD_DTCGETGEOM 0xFF /* get geometry data (DTC 5150X only?) */ -#define CMD_ST11GETGEOM 0xF8 /* get geometry data (Seagate ST11R/M only?) */ -#define CMD_WDSETPARAM 0x0C /* set drive parameters (WD 1004A27X only?) */ -#define CMD_XBSETPARAM 0x0C /* set drive parameters (XEBEC only?) */ - -/* Bits for command status byte */ -#define CSB_ERROR 0x02 /* error */ -#define CSB_LUN 0x20 /* logical Unit Number */ - -/* XT hard disk controller status bits */ -#define STAT_READY 0x01 /* controller is ready */ -#define STAT_INPUT 0x02 /* data flowing from controller to host */ -#define STAT_COMMAND 0x04 /* controller in command phase */ -#define STAT_SELECT 0x08 /* controller is selected */ -#define STAT_REQUEST 0x10 /* controller requesting data */ -#define STAT_INTERRUPT 0x20 /* controller requesting interrupt */ - -/* XT hard disk controller control bits */ -#define PIO_MODE 0x00 /* control bits to set for PIO */ -#define DMA_MODE 0x03 /* control bits to set for DMA & interrupt */ - -#define XD_MAXDRIVES 2 /* maximum 2 drives */ -#define XD_TIMEOUT HZ /* 1 second timeout */ -#define XD_RETRIES 4 /* maximum 4 retries */ - -#undef DEBUG /* define for debugging output */ - -#ifdef DEBUG - #define DEBUG_STARTUP /* debug driver initialisation */ - #define DEBUG_OVERRIDE /* debug override geometry detection */ - #define DEBUG_READWRITE /* debug each read/write command */ - #define DEBUG_OTHER /* debug misc. interrupt/DMA stuff */ - #define DEBUG_COMMAND /* debug each controller command */ -#endif /* DEBUG */ - -/* this structure defines the XT drives and their types */ -typedef struct { - u_char heads; - u_short cylinders; - u_char sectors; - u_char control; - int unit; -} XD_INFO; - -/* this structure defines a ROM BIOS signature */ -typedef struct { - unsigned int offset; - const char *string; - void (*init_controller)(unsigned int address); - void (*init_drive)(u_char drive); - const char *name; -} XD_SIGNATURE; - -#ifndef MODULE -static int xd_manual_geo_init (char *command); -#endif /* MODULE */ -static u_char xd_detect (u_char *controller, unsigned int *address); -static u_char xd_initdrives (void (*init_drive)(u_char drive)); - -static void do_xd_request (struct request_queue * q); -static int xd_ioctl (struct block_device *bdev,fmode_t mode,unsigned int cmd,unsigned long arg); -static int xd_readwrite (u_char operation,XD_INFO *disk,char *buffer,u_int block,u_int count); -static void xd_recalibrate (u_char drive); - -static irqreturn_t xd_interrupt_handler(int irq, void *dev_id); -static u_char xd_setup_dma (u_char opcode,u_char *buffer,u_int count); -static u_char *xd_build (u_char *cmdblk,u_char command,u_char drive,u_char head,u_short cylinder,u_char sector,u_char count,u_char control); -static void xd_watchdog (unsigned long unused); -static inline u_char xd_waitport (u_short port,u_char flags,u_char mask,u_long timeout); -static u_int xd_command (u_char *command,u_char mode,u_char *indata,u_char *outdata,u_char *sense,u_long timeout); - -/* card specific setup and geometry gathering code */ -static void xd_dtc_init_controller (unsigned int address); -static void xd_dtc5150cx_init_drive (u_char drive); -static void xd_dtc_init_drive (u_char drive); -static void xd_wd_init_controller (unsigned int address); -static void xd_wd_init_drive (u_char drive); -static void xd_seagate_init_controller (unsigned int address); -static void xd_seagate_init_drive (u_char drive); -static void xd_omti_init_controller (unsigned int address); -static void xd_omti_init_drive (u_char drive); -static void xd_xebec_init_controller (unsigned int address); -static void xd_xebec_init_drive (u_char drive); -static void xd_setparam (u_char command,u_char drive,u_char heads,u_short cylinders,u_short rwrite,u_short wprecomp,u_char ecc); -static void xd_override_init_drive (u_char drive); - -#endif /* _LINUX_XD_H */ diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 5ac841f..de1f319 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -46,6 +46,7 @@ #include <xen/xen.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <xen/balloon.h> #include "common.h" /* @@ -239,6 +240,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap); BUG_ON(ret); + free_xenballooned_pages(segs_to_unmap, pages); segs_to_unmap = 0; } @@ -527,8 +529,8 @@ static int xen_blkbk_map(struct blkif_request *req, GFP_KERNEL); if (!persistent_gnt) return -ENOMEM; - persistent_gnt->page = alloc_page(GFP_KERNEL); - if (!persistent_gnt->page) { + if (alloc_xenballooned_pages(1, &persistent_gnt->page, + false)) { kfree(persistent_gnt); return -ENOMEM; } @@ -879,7 +881,6 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, goto fail_response; } - preq.dev = req->u.rw.handle; preq.sector_number = req->u.rw.sector_number; preq.nr_sects = 0; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 6398072..5e237f6 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -367,6 +367,7 @@ static int xen_blkbk_remove(struct xenbus_device *dev) be->blkif = NULL; } + kfree(be->mode); kfree(be); dev_set_drvdata(&dev->dev, NULL); return 0; @@ -502,6 +503,7 @@ static void backend_changed(struct xenbus_watch *watch, = container_of(watch, struct backend_info, backend_watch); struct xenbus_device *dev = be->dev; int cdrom = 0; + unsigned long handle; char *device_type; DPRINTK(""); @@ -521,10 +523,10 @@ static void backend_changed(struct xenbus_watch *watch, return; } - if ((be->major || be->minor) && - ((be->major != major) || (be->minor != minor))) { - pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n", - be->major, be->minor, major, minor); + if (be->major | be->minor) { + if (be->major != major || be->minor != minor) + pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n", + be->major, be->minor, major, minor); return; } @@ -542,36 +544,33 @@ static void backend_changed(struct xenbus_watch *watch, kfree(device_type); } - if (be->major == 0 && be->minor == 0) { - /* Front end dir is a number, which is used as the handle. */ - - char *p = strrchr(dev->otherend, '/') + 1; - long handle; - err = strict_strtoul(p, 0, &handle); - if (err) - return; + /* Front end dir is a number, which is used as the handle. */ + err = strict_strtoul(strrchr(dev->otherend, '/') + 1, 0, &handle); + if (err) + return; - be->major = major; - be->minor = minor; + be->major = major; + be->minor = minor; - err = xen_vbd_create(be->blkif, handle, major, minor, - (NULL == strchr(be->mode, 'w')), cdrom); - if (err) { - be->major = 0; - be->minor = 0; - xenbus_dev_fatal(dev, err, "creating vbd structure"); - return; - } + err = xen_vbd_create(be->blkif, handle, major, minor, + !strchr(be->mode, 'w'), cdrom); + if (err) + xenbus_dev_fatal(dev, err, "creating vbd structure"); + else { err = xenvbd_sysfs_addif(dev); if (err) { xen_vbd_free(&be->blkif->vbd); - be->major = 0; - be->minor = 0; xenbus_dev_fatal(dev, err, "creating sysfs entries"); - return; } + } + if (err) { + kfree(be->mode); + be->mode = NULL; + be->major = 0; + be->minor = 0; + } else { /* We're potentially connected now */ xen_update_blkif_status(be->blkif); } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 11043c1..c3dae2e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -791,7 +791,7 @@ static void blkif_restart_queue(struct work_struct *work) static void blkif_free(struct blkfront_info *info, int suspend) { struct llist_node *all_gnts; - struct grant *persistent_gnt; + struct grant *persistent_gnt, *tmp; struct llist_node *n; /* Prevent new requests being issued until we fix things up. */ @@ -805,10 +805,17 @@ static void blkif_free(struct blkfront_info *info, int suspend) /* Remove all persistent grants */ if (info->persistent_gnts_c) { all_gnts = llist_del_all(&info->persistent_gnts); - llist_for_each_entry_safe(persistent_gnt, n, all_gnts, node) { + persistent_gnt = llist_entry(all_gnts, typeof(*(persistent_gnt)), node); + while (persistent_gnt) { gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); __free_page(pfn_to_page(persistent_gnt->pfn)); - kfree(persistent_gnt); + tmp = persistent_gnt; + n = persistent_gnt->node.next; + if (n) + persistent_gnt = llist_entry(n, typeof(*(persistent_gnt)), node); + else + persistent_gnt = NULL; + kfree(tmp); } info->persistent_gnts_c = 0; } |