From 2c5816b4beccc8ba709144539f6fdd764f8fa49c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Nov 2015 10:32:36 +0100 Subject: cuse: fix memory leak The problem is that fuse_dev_alloc() acquires an extra reference to cc.fc, and the original ref count is never dropped. Reported-by: Colin Ian King Signed-off-by: Miklos Szeredi Fixes: cc080e9e9be1 ("fuse: introduce per-instance fuse_dev structure") Cc: # v4.2+ diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index eae2c11..8e3ee19 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -549,6 +549,8 @@ static int cuse_channel_release(struct inode *inode, struct file *file) unregister_chrdev_region(cc->cdev->dev, 1); cdev_del(cc->cdev); } + /* Base reference is now owned by "fud" */ + fuse_conn_put(&cc->fc); rc = fuse_dev_release(inode, file); /* puts the base reference */ -- cgit v0.10.2 From 3ca8138f014a913f98e6ef40e939868e1e9ea876 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 12 Oct 2015 16:33:44 +0300 Subject: fuse: break infinite loop in fuse_fill_write_pages() I got a report about unkillable task eating CPU. Further investigation shows, that the problem is in the fuse_fill_write_pages() function. If iov's first segment has zero length, we get an infinite loop, because we never reach iov_iter_advance() call. Fix this by calling iov_iter_advance() before repeating an attempt to copy data from userspace. A similar problem is described in 124d3b7041f ("fix writev regression: pan hanging unkillable and un-straceable"). If zero-length segmend is followed by segment with invalid address, iov_iter_fault_in_readable() checks only first segment (zero-length), iov_iter_copy_from_user_atomic() skips it, fails at second and returns zero -> goto again without skipping zero-length segment. Patch calls iov_iter_advance() before goto again: we'll skip zero-length segment at second iteraction and iov_iter_fault_in_readable() will detect invalid address. Special thanks to Konstantin Khlebnikov, who helped a lot with the commit description. Cc: Andrew Morton Cc: Maxim Patlasov Cc: Konstantin Khlebnikov Signed-off-by: Roman Gushchin Signed-off-by: Miklos Szeredi Fixes: ea9b9907b82a ("fuse: implement perform_write") Cc: diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f523f2f..195476a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1049,6 +1049,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); flush_dcache_page(page); + iov_iter_advance(ii, tmp); if (!tmp) { unlock_page(page); page_cache_release(page); @@ -1061,7 +1062,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, req->page_descs[req->num_pages].length = tmp; req->num_pages++; - iov_iter_advance(ii, tmp); count += tmp; pos += tmp; offset += tmp; -- cgit v0.10.2 From 34c06254ff82a815fdccdfae7517a06c9b768cee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 00:12:24 -0500 Subject: cgroup: fix cftype->file_offset handling 6f60eade2433 ("cgroup: generalize obtaining the handles of and notifying cgroup files") introduced cftype->file_offset so that the handles for per-css file instances can be recorded. These handles then can be used, for example, to generate file modified notifications. Unfortunately, it made the wrong assumption that files are created once for a given css and removed on its destruction. Due to the dependencies among subsystems, a css may be hidden from userland and then later shown again. This is implemented by removing and re-creating the affected files, so the associated kernfs_node for a given cgroup file may change over time. This incorrect assumption led to the corruption of css->files lists. Reimplement cftype->file_offset handling so that cgroup_file->kn is protected by a lock and updated as files are created and destroyed. This also makes keeping them on per-cgroup list unnecessary. Signed-off-by: Tejun Heo Reported-by: James Sedgwick Fixes: 6f60eade2433 ("cgroup: generalize obtaining the handles of and notifying cgroup files") Acked-by: Johannes Weiner Acked-by: Zefan Li diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 60d44b2..869fd4a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -90,7 +90,6 @@ enum { */ struct cgroup_file { /* do not access any fields from outside cgroup core */ - struct list_head node; /* anchored at css->files */ struct kernfs_node *kn; }; @@ -134,9 +133,6 @@ struct cgroup_subsys_state { */ u64 serial_nr; - /* all cgroup_files associated with this css */ - struct list_head files; - /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 22e3754..f640830 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -88,6 +88,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); +void cgroup_file_notify(struct cgroup_file *cfile); char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); @@ -516,19 +517,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -/** - * cgroup_file_notify - generate a file modified event for a cgroup_file - * @cfile: target cgroup_file - * - * @cfile must have been obtained by setting cftype->file_offset. - */ -static inline void cgroup_file_notify(struct cgroup_file *cfile) -{ - /* might not have been created due to one of the CFTYPE selector flags */ - if (cfile->kn) - kernfs_notify(cfile->kn); -} - #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f1603c1..b316deb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -98,6 +98,12 @@ static DEFINE_SPINLOCK(css_set_lock); static DEFINE_SPINLOCK(cgroup_idr_lock); /* + * Protects cgroup_file->kn for !self csses. It synchronizes notifications + * against file removal/re-creation across css hiding. + */ +static DEFINE_SPINLOCK(cgroup_file_kn_lock); + +/* * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. */ @@ -1393,6 +1399,16 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); + + if (cft->file_offset) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); + struct cgroup_file *cfile = (void *)css + cft->file_offset; + + spin_lock_irq(&cgroup_file_kn_lock); + cfile->kn = NULL; + spin_unlock_irq(&cgroup_file_kn_lock); + } + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } @@ -1856,7 +1872,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); - INIT_LIST_HEAD(&cgrp->self.files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); @@ -3313,9 +3328,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; - kernfs_get(kn); + spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; - list_add(&cfile->node, &css->files); + spin_unlock_irq(&cgroup_file_kn_lock); } return 0; @@ -3553,6 +3568,22 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) } /** + * cgroup_file_notify - generate a file modified event for a cgroup_file + * @cfile: target cgroup_file + * + * @cfile must have been obtained by setting cftype->file_offset. + */ +void cgroup_file_notify(struct cgroup_file *cfile) +{ + unsigned long flags; + + spin_lock_irqsave(&cgroup_file_kn_lock, flags); + if (cfile->kn) + kernfs_notify(cfile->kn); + spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); +} + +/** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * @@ -4613,13 +4644,9 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; - struct cgroup_file *cfile; percpu_ref_exit(&css->refcnt); - list_for_each_entry(cfile, &css->files, node) - kernfs_put(cfile->kn); - if (ss) { /* css free path */ int id = css->id; @@ -4724,7 +4751,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); - INIT_LIST_HEAD(&css->files); css->serial_nr = css_serial_nr_next++; if (cgroup_parent(cgrp)) { -- cgit v0.10.2 From c5967b79ecabe2baca40658d9073e28b30d7f6cf Mon Sep 17 00:00:00 2001 From: "Charles_Rose@Dell.com" Date: Fri, 6 Nov 2015 14:18:56 -0600 Subject: ahci: Add Device ID for Intel Sunrise Point PCH This patch adds missing AHCI RAID SATA Device IDs for the Intel Sunrise Point PCH. Signed-off-by: Nanda Kishore Chinna Signed-off-by: Charles Rose Signed-off-by: Tejun Heo diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index ff02bb4..ee3fc84 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -350,8 +350,10 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x9d03), board_ahci }, /* Sunrise Point-LP AHCI */ { PCI_VDEVICE(INTEL, 0x9d05), board_ahci }, /* Sunrise Point-LP RAID */ { PCI_VDEVICE(INTEL, 0x9d07), board_ahci }, /* Sunrise Point-LP RAID */ + { PCI_VDEVICE(INTEL, 0xa102), board_ahci }, /* Sunrise Point-H AHCI */ { PCI_VDEVICE(INTEL, 0xa103), board_ahci }, /* Sunrise Point-H AHCI */ { PCI_VDEVICE(INTEL, 0xa105), board_ahci }, /* Sunrise Point-H RAID */ + { PCI_VDEVICE(INTEL, 0xa106), board_ahci }, /* Sunrise Point-H RAID */ { PCI_VDEVICE(INTEL, 0xa107), board_ahci }, /* Sunrise Point-H RAID */ { PCI_VDEVICE(INTEL, 0xa10f), board_ahci }, /* Sunrise Point-H RAID */ -- cgit v0.10.2 From 4d92f0099a06ef0e36c7673f7c090f1a448b2d1b Mon Sep 17 00:00:00 2001 From: Alexandra Yates Date: Mon, 16 Nov 2015 11:22:16 -0500 Subject: ahci: Order SATA device IDs for codename Lewisburg This change was to preserve the ascending order of device IDs. There was an exception with the first two Lewisburg device IDs to keep all device IDs of the same kind grouped by code name. Signed-off-by: Alexandra Yates signed-off-by: Tejun Heo diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index ee3fc84..cdfbcc5 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -314,16 +314,6 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x1f37), board_ahci_avn }, /* Avoton RAID */ { PCI_VDEVICE(INTEL, 0x1f3e), board_ahci_avn }, /* Avoton RAID */ { PCI_VDEVICE(INTEL, 0x1f3f), board_ahci_avn }, /* Avoton RAID */ - { PCI_VDEVICE(INTEL, 0xa182), board_ahci }, /* Lewisburg AHCI*/ - { PCI_VDEVICE(INTEL, 0xa202), board_ahci }, /* Lewisburg AHCI*/ - { PCI_VDEVICE(INTEL, 0xa184), board_ahci }, /* Lewisburg RAID*/ - { PCI_VDEVICE(INTEL, 0xa204), board_ahci }, /* Lewisburg RAID*/ - { PCI_VDEVICE(INTEL, 0xa186), board_ahci }, /* Lewisburg RAID*/ - { PCI_VDEVICE(INTEL, 0xa206), board_ahci }, /* Lewisburg RAID*/ - { PCI_VDEVICE(INTEL, 0x2822), board_ahci }, /* Lewisburg RAID*/ - { PCI_VDEVICE(INTEL, 0x2826), board_ahci }, /* Lewisburg RAID*/ - { PCI_VDEVICE(INTEL, 0xa18e), board_ahci }, /* Lewisburg RAID*/ - { PCI_VDEVICE(INTEL, 0xa20e), board_ahci }, /* Lewisburg RAID*/ { PCI_VDEVICE(INTEL, 0x2823), board_ahci }, /* Wellsburg RAID */ { PCI_VDEVICE(INTEL, 0x2827), board_ahci }, /* Wellsburg RAID */ { PCI_VDEVICE(INTEL, 0x8d02), board_ahci }, /* Wellsburg AHCI */ @@ -356,6 +346,16 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0xa106), board_ahci }, /* Sunrise Point-H RAID */ { PCI_VDEVICE(INTEL, 0xa107), board_ahci }, /* Sunrise Point-H RAID */ { PCI_VDEVICE(INTEL, 0xa10f), board_ahci }, /* Sunrise Point-H RAID */ + { PCI_VDEVICE(INTEL, 0x2822), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0x2826), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0xa182), board_ahci }, /* Lewisburg AHCI*/ + { PCI_VDEVICE(INTEL, 0xa184), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0xa186), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0xa18e), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0xa202), board_ahci }, /* Lewisburg AHCI*/ + { PCI_VDEVICE(INTEL, 0xa204), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0xa206), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0xa20e), board_ahci }, /* Lewisburg RAID*/ /* JMicron 360/1/3/5/6, match class to avoid IDE function */ { PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, -- cgit v0.10.2 From ce36f6ad862fc3bc2466c89de17694e582a778b5 Mon Sep 17 00:00:00 2001 From: Pankaj Dubey Date: Fri, 13 Nov 2015 14:59:36 +0530 Subject: ARM: EXYNOS: Fix potential NULL pointer access in exynos_sys_powerdown_conf If no platform devices binded to the driver but driver itself loaded and exynos_sys_powerdown_conf is called from arch/arm/mach-exynos/{suspend.c, pm.c} it will result in NULL pointer access, to prevent this added check on pmu_context for NULL. Signed-off-by: Pankaj Dubey Reviewed-by: Krzysztof Kozlowski Tested-by: Krzysztof Kozlowski Signed-off-by: Krzysztof Kozlowski diff --git a/arch/arm/mach-exynos/pmu.c b/arch/arm/mach-exynos/pmu.c index de68938..c21e41d 100644 --- a/arch/arm/mach-exynos/pmu.c +++ b/arch/arm/mach-exynos/pmu.c @@ -748,8 +748,12 @@ static void exynos5_powerdown_conf(enum sys_powerdown mode) void exynos_sys_powerdown_conf(enum sys_powerdown mode) { unsigned int i; + const struct exynos_pmu_data *pmu_data; + + if (!pmu_context) + return; - const struct exynos_pmu_data *pmu_data = pmu_context->pmu_data; + pmu_data = pmu_context->pmu_data; if (pmu_data->powerdown_conf) pmu_data->powerdown_conf(mode); -- cgit v0.10.2 From 183e53e8ddf4165c3763181682189362d6b403f7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Nov 2015 17:18:40 +0100 Subject: usb: musb: USB_TI_CPPI41_DMA requires dmaengine support The CPPI-4.1 driver selects TI_CPPI41, which is a dmaengine driver and that may not be available when CONFIG_DMADEVICES is not set: warning: (USB_TI_CPPI41_DMA) selects TI_CPPI41 which has unmet direct dependencies (DMADEVICES && ARCH_OMAP) This adds an extra dependency to avoid generating warnings in randconfig builds. Ideally we'd remove the 'select' statement, but that has the potential to break defconfig files. Signed-off-by: Arnd Bergmann Fixes: 411dd19c682d ("usb: musb: Kconfig: Select the DMA driver if DMA mode of MUSB is enabled") Signed-off-by: Felipe Balbi diff --git a/drivers/usb/musb/Kconfig b/drivers/usb/musb/Kconfig index 1f2037b..45c83ba 100644 --- a/drivers/usb/musb/Kconfig +++ b/drivers/usb/musb/Kconfig @@ -159,7 +159,7 @@ config USB_TI_CPPI_DMA config USB_TI_CPPI41_DMA bool 'TI CPPI 4.1 (AM335x)' - depends on ARCH_OMAP + depends on ARCH_OMAP && DMADEVICES select TI_CPPI41 config USB_TUSB_OMAP_DMA -- cgit v0.10.2 From 7fe9a937d5156761cf79dc354f616c726b5c09e2 Mon Sep 17 00:00:00 2001 From: Daniel Walter Date: Wed, 18 Nov 2015 17:15:49 +0100 Subject: usb: gadget: functionfs: fix missing access_ok checks use safe copy_*_user instead of unsafe __copy_*_user functions when accessing userland memory. Signed-off-by: Daniel Walter Signed-off-by: Felipe Balbi diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index adc6d52..cf43e9e 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -423,7 +423,7 @@ static ssize_t __ffs_ep0_read_events(struct ffs_data *ffs, char __user *buf, spin_unlock_irq(&ffs->ev.waitq.lock); mutex_unlock(&ffs->mutex); - return unlikely(__copy_to_user(buf, events, size)) ? -EFAULT : size; + return unlikely(copy_to_user(buf, events, size)) ? -EFAULT : size; } static ssize_t ffs_ep0_read(struct file *file, char __user *buf, @@ -513,7 +513,7 @@ static ssize_t ffs_ep0_read(struct file *file, char __user *buf, /* unlocks spinlock */ ret = __ffs_ep0_queue_wait(ffs, data, len); - if (likely(ret > 0) && unlikely(__copy_to_user(buf, data, len))) + if (likely(ret > 0) && unlikely(copy_to_user(buf, data, len))) ret = -EFAULT; goto done_mutex; @@ -3493,7 +3493,7 @@ static char *ffs_prepare_buffer(const char __user *buf, size_t len) if (unlikely(!data)) return ERR_PTR(-ENOMEM); - if (unlikely(__copy_from_user(data, buf, len))) { + if (unlikely(copy_from_user(data, buf, len))) { kfree(data); return ERR_PTR(-EFAULT); } -- cgit v0.10.2 From f66541ba02d5cb619b84fa4c5b43b3bcc8764716 Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Tue, 17 Nov 2015 11:56:44 +0200 Subject: clk: gpio: Get parent clk names in of_gpio_clk_setup() Get parent clk names in of_gpio_clk_setup() and store the names in struct clk_gpio_delayed_register_data instead of doing it from the clk provider's get() callback. of_clk_get_parent_name() can't be called in struct of_clk_provider's get() callback since it may make a call to of_clk_get_from_provider() and this in turn tries to recursively lock of_clk_mutex. Signed-off-by: Jyri Sarha Cc: Sergej Sawazki Fixes: 0a4807c2f9a4 ("clk: Make of_clk_get_parent_name() robust with #clock-cells = 1") Signed-off-by: Stephen Boyd diff --git a/drivers/clk/clk-gpio.c b/drivers/clk/clk-gpio.c index 10819e2..335322d 100644 --- a/drivers/clk/clk-gpio.c +++ b/drivers/clk/clk-gpio.c @@ -209,6 +209,8 @@ EXPORT_SYMBOL_GPL(clk_register_gpio_mux); struct clk_gpio_delayed_register_data { const char *gpio_name; + int num_parents; + const char **parent_names; struct device_node *node; struct mutex lock; struct clk *clk; @@ -222,8 +224,6 @@ static struct clk *of_clk_gpio_delayed_register_get( { struct clk_gpio_delayed_register_data *data = _data; struct clk *clk; - const char **parent_names; - int i, num_parents; int gpio; enum of_gpio_flags of_flags; @@ -248,26 +248,14 @@ static struct clk *of_clk_gpio_delayed_register_get( return ERR_PTR(gpio); } - num_parents = of_clk_get_parent_count(data->node); - - parent_names = kcalloc(num_parents, sizeof(char *), GFP_KERNEL); - if (!parent_names) { - clk = ERR_PTR(-ENOMEM); - goto out; - } - - for (i = 0; i < num_parents; i++) - parent_names[i] = of_clk_get_parent_name(data->node, i); - - clk = data->clk_register_get(data->node->name, parent_names, - num_parents, gpio, of_flags & OF_GPIO_ACTIVE_LOW); + clk = data->clk_register_get(data->node->name, data->parent_names, + data->num_parents, gpio, of_flags & OF_GPIO_ACTIVE_LOW); if (IS_ERR(clk)) goto out; data->clk = clk; out: mutex_unlock(&data->lock); - kfree(parent_names); return clk; } @@ -296,11 +284,24 @@ static void __init of_gpio_clk_setup(struct device_node *node, unsigned gpio, bool active_low)) { struct clk_gpio_delayed_register_data *data; + const char **parent_names; + int i, num_parents; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return; + num_parents = of_clk_get_parent_count(node); + + parent_names = kcalloc(num_parents, sizeof(char *), GFP_KERNEL); + if (!parent_names) + return; + + for (i = 0; i < num_parents; i++) + parent_names[i] = of_clk_get_parent_name(node, i); + + data->num_parents = num_parents; + data->parent_names = parent_names; data->node = node; data->gpio_name = gpio_name; data->clk_register_get = clk_register_get; -- cgit v0.10.2 From 391e6dcb37857d5659b53def2f41e2f56850d33c Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Wed, 18 Nov 2015 17:06:00 -0600 Subject: usb: gadget: pxa27x: fix suspend callback pxa27x disconnects pullups on suspend but doesn't notify the gadget driver about it, so gadget driver can't disable the endpoints it was using. This causes problems on resume because gadget core will think endpoints are still enabled and just ignore the following usb_ep_enable(). Fix this problem by calling gadget_driver->disconnect(). Cc: # v3.10+ Tested-by: Robert Jarzmik Signed-off-by: Felipe Balbi diff --git a/drivers/usb/gadget/udc/pxa27x_udc.c b/drivers/usb/gadget/udc/pxa27x_udc.c index 670ac0b..001a3b7 100644 --- a/drivers/usb/gadget/udc/pxa27x_udc.c +++ b/drivers/usb/gadget/udc/pxa27x_udc.c @@ -2536,6 +2536,9 @@ static int pxa_udc_suspend(struct platform_device *_dev, pm_message_t state) udc->pullup_resume = udc->pullup_on; dplus_pullup(udc, 0); + if (udc->driver) + udc->driver->disconnect(&udc->gadget); + return 0; } -- cgit v0.10.2 From ac722e302cc057a65cb1d975f8b17e8e11144946 Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Thu, 19 Nov 2015 11:47:47 +0800 Subject: usb: kconfig: fix warning of select USB_OTG When choose randconfig for kernel build, it reports below warning: "warning: (USB_OTG_FSM && FSL_USB2_OTG && USB_MV_OTG) selects USB_OTG which has unmet direct dependencies (USB_SUPPORT && USB && PM)" In fact, USB_OTG is visible symbol and depends on PM, so the driver needs to depend on it to reduce dependency problem. Signed-off-by: Peter Chen Reported-by: Arnd Bergmann Cc: Felipe Balbi Acked-by: Arnd Bergmann Signed-off-by: Felipe Balbi diff --git a/drivers/usb/core/Kconfig b/drivers/usb/core/Kconfig index a99c89e..dd28010 100644 --- a/drivers/usb/core/Kconfig +++ b/drivers/usb/core/Kconfig @@ -77,8 +77,7 @@ config USB_OTG_BLACKLIST_HUB config USB_OTG_FSM tristate "USB 2.0 OTG FSM implementation" - depends on USB - select USB_OTG + depends on USB && USB_OTG select USB_PHY help Implements OTG Finite State Machine as specified in On-The-Go diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig index 1731324..22e8ecb 100644 --- a/drivers/usb/phy/Kconfig +++ b/drivers/usb/phy/Kconfig @@ -21,7 +21,6 @@ config AB8500_USB config FSL_USB2_OTG bool "Freescale USB OTG Transceiver Driver" depends on USB_EHCI_FSL && USB_FSL_USB2 && USB_OTG_FSM && PM - select USB_OTG select USB_PHY help Enable this to support Freescale USB OTG transceiver. @@ -168,8 +167,7 @@ config USB_QCOM_8X16_PHY config USB_MV_OTG tristate "Marvell USB OTG support" - depends on USB_EHCI_MV && USB_MV_UDC && PM - select USB_OTG + depends on USB_EHCI_MV && USB_MV_UDC && PM && USB_OTG select USB_PHY help Say Y here if you want to build Marvell USB OTG transciever -- cgit v0.10.2 From c9eb29503e9655e70448bbbf3697d08a56d24854 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 18 Nov 2015 14:33:35 +0900 Subject: usb: renesas_usbhs: gadget: Fix NULL pointer dereference in usbhsg_ep_dequeue() This patch fixes an issue that NULL pointer dereference happens when a gadget driver calls usb_ep_dequeue() for ep0 after disconnected a usb cable. This is because that usbhsg_try_stop() will call usbhsg_ep_disable(&dcp->ep) when a usb cable is disconnected and the pipe of dcp (ep0) is set to NULL. Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi diff --git a/drivers/usb/renesas_usbhs/mod_gadget.c b/drivers/usb/renesas_usbhs/mod_gadget.c index de4f97d..8f7a78e 100644 --- a/drivers/usb/renesas_usbhs/mod_gadget.c +++ b/drivers/usb/renesas_usbhs/mod_gadget.c @@ -131,7 +131,8 @@ static void __usbhsg_queue_pop(struct usbhsg_uep *uep, struct device *dev = usbhsg_gpriv_to_dev(gpriv); struct usbhs_priv *priv = usbhsg_gpriv_to_priv(gpriv); - dev_dbg(dev, "pipe %d : queue pop\n", usbhs_pipe_number(pipe)); + if (pipe) + dev_dbg(dev, "pipe %d : queue pop\n", usbhs_pipe_number(pipe)); ureq->req.status = status; spin_unlock(usbhs_priv_to_lock(priv)); @@ -685,7 +686,13 @@ static int usbhsg_ep_dequeue(struct usb_ep *ep, struct usb_request *req) struct usbhsg_request *ureq = usbhsg_req_to_ureq(req); struct usbhs_pipe *pipe = usbhsg_uep_to_pipe(uep); - usbhs_pkt_pop(pipe, usbhsg_ureq_to_pkt(ureq)); + if (pipe) + usbhs_pkt_pop(pipe, usbhsg_ureq_to_pkt(ureq)); + + /* + * To dequeue a request, this driver should call the usbhsg_queue_pop() + * even if the pipe is NULL. + */ usbhsg_queue_pop(uep, ureq, -ECONNRESET); return 0; -- cgit v0.10.2 From d0464bcf12af54bafe02dd23ac5bd75d825f9fdd Mon Sep 17 00:00:00 2001 From: John Youn Date: Fri, 13 Nov 2015 17:02:10 +0000 Subject: usb: dwc2: Make PHY optional Fixes commit 09a75e857790 ("usb: dwc2: refactor common low-level hw code to platform.c") The above commit consolidated the low-level phy access into a common location. This change introduced a check from the gadget requiring that a PHY is specified. This requirement never existed on the host side and broke some platforms when it was moved into platform.c. The gadget doesn't require the PHY either so remove the check. Acked-by: Eric Anholt Reported-by: Stefan Wahren Cc: Marek Szyprowski Signed-off-by: John Youn Tested-by: Marek Szyprowski Fixes: 09a75e857790 ("usb: dwc2: refactor common low-level hw code to platform.c") Signed-off-by: Felipe Balbi diff --git a/drivers/usb/dwc2/platform.c b/drivers/usb/dwc2/platform.c index e61d773..60b12a4 100644 --- a/drivers/usb/dwc2/platform.c +++ b/drivers/usb/dwc2/platform.c @@ -229,11 +229,6 @@ static int dwc2_lowlevel_hw_init(struct dwc2_hsotg *hsotg) hsotg->phyif = GUSBCFG_PHYIF8; } - if (!hsotg->phy && !hsotg->uphy && !hsotg->plat) { - dev_err(hsotg->dev, "no platform data or transceiver defined\n"); - return -EPROBE_DEFER; - } - /* Clock */ hsotg->clk = devm_clk_get(hsotg->dev, "otg"); if (IS_ERR(hsotg->clk)) { -- cgit v0.10.2 From 6c2dad69163fdb4ea82344dcba360fc00b4adda4 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 13 Nov 2015 17:02:11 +0000 Subject: usb: dwc2: Return errors from PHY When searching for PHYs, any error was treated as if the PHY did not exist or was not specified. Thus the probe function did not correctly return error conditions such as -EPROBE_DEFER. Fixed so that only a non-existing PHY is ignored and any other error is returned. Acked-by: Eric Anholt Reported-by: Alexander Aring Signed-off-by: John Youn Signed-off-by: Stefan Wahren Tested-by: Marek Szyprowski Signed-off-by: Felipe Balbi diff --git a/drivers/usb/dwc2/platform.c b/drivers/usb/dwc2/platform.c index 60b12a4..d28f72e 100644 --- a/drivers/usb/dwc2/platform.c +++ b/drivers/usb/dwc2/platform.c @@ -212,14 +212,41 @@ static int dwc2_lowlevel_hw_init(struct dwc2_hsotg *hsotg) */ hsotg->phy = devm_phy_get(hsotg->dev, "usb2-phy"); if (IS_ERR(hsotg->phy)) { - hsotg->phy = NULL; + ret = PTR_ERR(hsotg->phy); + switch (ret) { + case -ENODEV: + case -ENOSYS: + hsotg->phy = NULL; + break; + case -EPROBE_DEFER: + return ret; + default: + dev_err(hsotg->dev, "error getting phy %d\n", ret); + return ret; + } + } + + if (!hsotg->phy) { hsotg->uphy = devm_usb_get_phy(hsotg->dev, USB_PHY_TYPE_USB2); - if (IS_ERR(hsotg->uphy)) - hsotg->uphy = NULL; - else - hsotg->plat = dev_get_platdata(hsotg->dev); + if (IS_ERR(hsotg->uphy)) { + ret = PTR_ERR(hsotg->uphy); + switch (ret) { + case -ENODEV: + case -ENXIO: + hsotg->uphy = NULL; + break; + case -EPROBE_DEFER: + return ret; + default: + dev_err(hsotg->dev, "error getting usb phy %d\n", + ret); + return ret; + } + } } + hsotg->plat = dev_get_platdata(hsotg->dev); + if (hsotg->phy) { /* * If using the generic PHY framework, check if the PHY bus -- cgit v0.10.2 From 8aa90cf2a28645c6733f8879c5fe1848c5d510b7 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 13 Nov 2015 17:02:12 +0000 Subject: usb: dwc2: make otg clk optional Fixes commit 09a75e857790 ("usb: dwc2: refactor common low-level hw code to platform.c") The above commit consolidated the low-level phy access into a common location. This change made the otg clk a requirement and broke some platforms when it was moved into platform.c. So make clk handling optional again. Acked-by: Eric Anholt Signed-off-by: Stefan Wahren Cc: Marek Szyprowski Acked-by: John Youn Tested-by: Marek Szyprowski Fixes: 09a75e857790 ("usb: dwc2: refactor common low-level hw code to platform.c") Signed-off-by: Felipe Balbi diff --git a/drivers/usb/dwc2/platform.c b/drivers/usb/dwc2/platform.c index d28f72e..91e2e6b 100644 --- a/drivers/usb/dwc2/platform.c +++ b/drivers/usb/dwc2/platform.c @@ -125,9 +125,11 @@ static int __dwc2_lowlevel_hw_enable(struct dwc2_hsotg *hsotg) if (ret) return ret; - ret = clk_prepare_enable(hsotg->clk); - if (ret) - return ret; + if (hsotg->clk) { + ret = clk_prepare_enable(hsotg->clk); + if (ret) + return ret; + } if (hsotg->uphy) ret = usb_phy_init(hsotg->uphy); @@ -175,7 +177,8 @@ static int __dwc2_lowlevel_hw_disable(struct dwc2_hsotg *hsotg) if (ret) return ret; - clk_disable_unprepare(hsotg->clk); + if (hsotg->clk) + clk_disable_unprepare(hsotg->clk); ret = regulator_bulk_disable(ARRAY_SIZE(hsotg->supplies), hsotg->supplies); -- cgit v0.10.2 From e9ca7e4bb30cbf4c8d2e2e34681a967c10b3ff54 Mon Sep 17 00:00:00 2001 From: "Felipe F. Tonello" Date: Tue, 10 Nov 2015 17:52:03 +0000 Subject: usb: gadget: f_midi: Transmit data only when IN ep is enabled This makes sure f_midi doesn't try to enqueue data when the IN endpoint is disabled, ie, USB cable is disconnected. Reviewed-by: Robert Baldyga Signed-off-by: Felipe F. Tonello Signed-off-by: Felipe Balbi diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 42acb45..7877aa8 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -545,7 +545,7 @@ static void f_midi_transmit(struct f_midi *midi, struct usb_request *req) } } - if (req->length > 0) { + if (req->length > 0 && ep->enabled) { int err; err = usb_ep_queue(ep, req, GFP_ATOMIC); -- cgit v0.10.2 From ad0d1a058eac46503edbc510d1ce44c5df8e0c91 Mon Sep 17 00:00:00 2001 From: "Felipe F. Tonello" Date: Tue, 10 Nov 2015 17:52:06 +0000 Subject: usb: gadget: f_midi: fix leak on failed to enqueue out requests This patch fixes a memory leak that occurs when an endpoint fails to enqueue the request. If that happens the complete function will never be called, thus never freeing the request. Signed-off-by: Felipe F. Tonello Signed-off-by: Felipe Balbi diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 7877aa8..898a570 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -370,6 +370,7 @@ static int f_midi_set_alt(struct usb_function *f, unsigned intf, unsigned alt) if (err) { ERROR(midi, "%s queue req: %d\n", midi->out_ep->name, err); + free_ep_req(midi->out_ep, req); } } -- cgit v0.10.2 From 5c256b68348e9717c8ba23521ef52ef7f69d4448 Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Wed, 16 Sep 2015 15:52:33 +0800 Subject: usb: phy: mxs: add "fsl,imx6ul-usbphy" compatible string Add "fsl,imx6ul-usbphy" compatible string for iMX6ul usb phy Signed-off-by: Peter Chen Signed-off-by: Li Jun Signed-off-by: Felipe Balbi diff --git a/drivers/usb/phy/phy-mxs-usb.c b/drivers/usb/phy/phy-mxs-usb.c index b7536af..c2936dc 100644 --- a/drivers/usb/phy/phy-mxs-usb.c +++ b/drivers/usb/phy/phy-mxs-usb.c @@ -143,12 +143,17 @@ static const struct mxs_phy_data imx6sx_phy_data = { .flags = MXS_PHY_DISCONNECT_LINE_WITHOUT_VBUS, }; +static const struct mxs_phy_data imx6ul_phy_data = { + .flags = MXS_PHY_DISCONNECT_LINE_WITHOUT_VBUS, +}; + static const struct of_device_id mxs_phy_dt_ids[] = { { .compatible = "fsl,imx6sx-usbphy", .data = &imx6sx_phy_data, }, { .compatible = "fsl,imx6sl-usbphy", .data = &imx6sl_phy_data, }, { .compatible = "fsl,imx6q-usbphy", .data = &imx6q_phy_data, }, { .compatible = "fsl,imx23-usbphy", .data = &imx23_phy_data, }, { .compatible = "fsl,vf610-usbphy", .data = &vf610_phy_data, }, + { .compatible = "fsl,imx6ul-usbphy", .data = &imx6ul_phy_data, }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, mxs_phy_dt_ids); -- cgit v0.10.2 From 7fe941422339c96072fc1eec4e8b484ccf7ccd0e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 19 Nov 2015 13:00:14 +0900 Subject: vfio: Drop owner assignment from platform_driver platform_driver does not need to set an owner because platform_driver_register() will set it. Signed-off-by: Krzysztof Kozlowski Acked-by: Baptiste Reynal Signed-off-by: Alex Williamson diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index f1625dc..b1cc3a7 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -92,7 +92,6 @@ static struct platform_driver vfio_platform_driver = { .remove = vfio_platform_remove, .driver = { .name = "vfio-platform", - .owner = THIS_MODULE, }, }; -- cgit v0.10.2 From 7d10f4e0797597a31f2724b46e619ee10d8eb501 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 14 Nov 2015 11:07:01 +0100 Subject: vfio-pci: constify pci_error_handlers structures This pci_error_handlers structure is never modified, like all the other pci_error_handlers structures, so declare it as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Alex Williamson diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 32b88bd..2760a7b 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1035,7 +1035,7 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } -static struct pci_error_handlers vfio_err_handlers = { +static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, }; -- cgit v0.10.2 From f74875dc36135ebae82a8e005f4b7f52289d2c40 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Thu, 12 Nov 2015 21:08:34 +0000 Subject: usb: dwc2: fix kernel oops during driver probe This patch make sure that all necessary members of dwc2_hsotg are initialized before the irq handler is requested. So the kernel oops triggered by dwc2_handle_common_intr has been fixed. dwc2 20980000.usb: Configuration mismatch. Forcing host mode dwc2 20980000.usb: no platform data or transceiver defined Unable to handle kernel paging request at virtual address cc860040 pgd = c0004000 [cc860040] *pgd=0b41e811, *pte=00000000, *ppte=00000000 Internal error: Oops: 7 [#1] ARM CPU: 0 PID: 1 Comm: swapper Not tainted 4.3.0-rc3+ #19 Hardware name: BCM2835 task: cb494000 ti: cb4d0000 task.ti: cb4d0000 PC is at dwc2_is_controller_alive+0x18/0x34 LR is at dwc2_handle_common_intr+0x24/0xb60 Acked-by: John Youn Tested-by: Marek Szyprowski Signed-off-by: Stefan Wahren Signed-off-by: Felipe Balbi diff --git a/drivers/usb/dwc2/platform.c b/drivers/usb/dwc2/platform.c index 91e2e6b..39c1cbf 100644 --- a/drivers/usb/dwc2/platform.c +++ b/drivers/usb/dwc2/platform.c @@ -367,20 +367,6 @@ static int dwc2_driver_probe(struct platform_device *dev) if (retval) return retval; - irq = platform_get_irq(dev, 0); - if (irq < 0) { - dev_err(&dev->dev, "missing IRQ resource\n"); - return irq; - } - - dev_dbg(hsotg->dev, "registering common handler for irq%d\n", - irq); - retval = devm_request_irq(hsotg->dev, irq, - dwc2_handle_common_intr, IRQF_SHARED, - dev_name(hsotg->dev), hsotg); - if (retval) - return retval; - res = platform_get_resource(dev, IORESOURCE_MEM, 0); hsotg->regs = devm_ioremap_resource(&dev->dev, res); if (IS_ERR(hsotg->regs)) @@ -415,6 +401,20 @@ static int dwc2_driver_probe(struct platform_device *dev) dwc2_set_all_params(hsotg->core_params, -1); + irq = platform_get_irq(dev, 0); + if (irq < 0) { + dev_err(&dev->dev, "missing IRQ resource\n"); + return irq; + } + + dev_dbg(hsotg->dev, "registering common handler for irq%d\n", + irq); + retval = devm_request_irq(hsotg->dev, irq, + dwc2_handle_common_intr, IRQF_SHARED, + dev_name(hsotg->dev), hsotg); + if (retval) + return retval; + retval = dwc2_lowlevel_hw_enable(hsotg); if (retval) return retval; -- cgit v0.10.2 From 7200be7c81cc1a1c54a79348a724c16345c5242f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Nov 2015 16:17:24 -0800 Subject: vfio: platform: remove needless stack usage request_module already takes format strings, so no need to duplicate the effort. Signed-off-by: Kees Cook Signed-off-by: Alex Williamson diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index a1c50d6..418cdd9 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -51,13 +51,10 @@ static vfio_platform_reset_fn_t vfio_platform_lookup_reset(const char *compat, static void vfio_platform_get_reset(struct vfio_platform_device *vdev) { - char modname[256]; - vdev->reset = vfio_platform_lookup_reset(vdev->compat, &vdev->reset_module); if (!vdev->reset) { - snprintf(modname, 256, "vfio-reset:%s", vdev->compat); - request_module(modname); + request_module("vfio-reset:%s", vdev->compat); vdev->reset = vfio_platform_lookup_reset(vdev->compat, &vdev->reset_module); } -- cgit v0.10.2 From 4f1dd973acffeb67142d39f3bfbc4dd0569f2da6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Nov 2015 15:16:30 +0100 Subject: sata/mvebu: use #ifdef around suspend/resume code The newly added suspend/resume implementation for ahci_mvebu causes a link error when CONFIG_PM_SLEEP is disabled: ERROR: "ahci_platform_suspend_host" [drivers/ata/ahci_mvebu.ko] undefined! ERROR: "ahci_platform_resume_host" [drivers/ata/ahci_mvebu.ko] undefined! This adds the same #ifdef here that exists in the ahci_platform driver which defines the above functions. Signed-off-by: Arnd Bergmann Fixes: d6ecf1581488 ("ata: ahci_mvebu: add suspend/resume support") Acked-by: Thomas Petazzoni Signed-off-by: Tejun Heo diff --git a/drivers/ata/ahci_mvebu.c b/drivers/ata/ahci_mvebu.c index 8490d37..f7a7fa8 100644 --- a/drivers/ata/ahci_mvebu.c +++ b/drivers/ata/ahci_mvebu.c @@ -62,6 +62,7 @@ static void ahci_mvebu_regret_option(struct ahci_host_priv *hpriv) writel(0x80, hpriv->mmio + AHCI_VENDOR_SPECIFIC_0_DATA); } +#ifdef CONFIG_PM_SLEEP static int ahci_mvebu_suspend(struct platform_device *pdev, pm_message_t state) { return ahci_platform_suspend_host(&pdev->dev); @@ -81,6 +82,10 @@ static int ahci_mvebu_resume(struct platform_device *pdev) return ahci_platform_resume_host(&pdev->dev); } +#else +#define ahci_mvebu_suspend NULL +#define ahci_mvebu_resume NULL +#endif static const struct ata_port_info ahci_mvebu_port_info = { .flags = AHCI_FLAG_COMMON, -- cgit v0.10.2 From 049af1060bb81532f2700762a8ba71eb3fa81f5a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 21 Nov 2015 13:32:21 +0300 Subject: vfio: fix a warning message The first argument to the WARN() macro has to be a condition. I'm sort of disappointed that this code doesn't generate a compiler warning. I guess -Wformat-extra-args doesn't work in the kernel. Signed-off-by: Dan Carpenter Signed-off-by: Alex Williamson diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index de632da..9da0703 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -682,7 +682,7 @@ static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev) return 0; /* TODO Prevent device auto probing */ - WARN("Device %s added to live group %d!\n", dev_name(dev), + WARN(1, "Device %s added to live group %d!\n", dev_name(dev), iommu_group_id(group->iommu_group)); return 0; -- cgit v0.10.2 From 2e9fed42209b17116c6221e136ccbd3f252f5f86 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Tue, 27 Oct 2015 20:40:56 +0200 Subject: staging: iio: dummy: complete IIO events delivery to userspace Starting with commit fd2bb310ca (Staging: iio: Move evgen interrupt generation to irq_work) event processing is handled by calling both the top half and the threaded part properly simulating real hardware interrupts making use of threaded interrupts. This way the processing is split in 2 parts: * the IRQ handler that runs in IRQ context and only saves the event timestamp * the threaded handler that runs in process context, reads the events and pushes the in the userspace. If the IRQ handler returns IRQ_HANDLED the threaded handler is not even being called since the interrupt is considered to be processed. Because the iio dummy driver processes the events in the threaded handler the IRQ handler must return IRQ_WAKE_THREAD so that the threaded part would be awakened and called. Signed-off-by: Ioana Ciornei Signed-off-by: Jonathan Cameron diff --git a/drivers/staging/iio/iio_simple_dummy_events.c b/drivers/staging/iio/iio_simple_dummy_events.c index bfbf1c5..6eb600f 100644 --- a/drivers/staging/iio/iio_simple_dummy_events.c +++ b/drivers/staging/iio/iio_simple_dummy_events.c @@ -159,7 +159,7 @@ static irqreturn_t iio_simple_dummy_get_timestamp(int irq, void *private) struct iio_dummy_state *st = iio_priv(indio_dev); st->event_timestamp = iio_get_time_ns(); - return IRQ_HANDLED; + return IRQ_WAKE_THREAD; } /** -- cgit v0.10.2 From 45a6b8218df54087c3bb8dc731424d8789d31790 Mon Sep 17 00:00:00 2001 From: Matt Ranostay Date: Mon, 26 Oct 2015 20:18:23 -0700 Subject: iio: lidar: return -EINVAL on invalid signal Returning zero from the measurment function has the side effect of corrupting the triggered buffer readings, better to use -EINVAL than a zero measurement reading. The INVALID status happens even it isn't out of range sometimes roughly once every second or two. This can be from an invalid second signal return path. Hence there are spurious zero readings from the triggered buffer, and warning messages in the kernel log. Signed-off-by: Matt Ranostay Signed-off-by: Jonathan Cameron diff --git a/drivers/iio/proximity/pulsedlight-lidar-lite-v2.c b/drivers/iio/proximity/pulsedlight-lidar-lite-v2.c index 961f9f99..e544fcf 100644 --- a/drivers/iio/proximity/pulsedlight-lidar-lite-v2.c +++ b/drivers/iio/proximity/pulsedlight-lidar-lite-v2.c @@ -130,10 +130,10 @@ static int lidar_get_measurement(struct lidar_data *data, u16 *reg) if (ret < 0) break; - /* return 0 since laser is likely pointed out of range */ + /* return -EINVAL since laser is likely pointed out of range */ if (ret & LIDAR_REG_STATUS_INVALID) { *reg = 0; - ret = 0; + ret = -EINVAL; break; } @@ -197,7 +197,7 @@ static irqreturn_t lidar_trigger_handler(int irq, void *private) if (!ret) { iio_push_to_buffers_with_timestamp(indio_dev, data->buffer, iio_get_time_ns()); - } else { + } else if (ret != -EINVAL) { dev_err(&data->client->dev, "cannot read LIDAR measurement"); } -- cgit v0.10.2 From 8386c27587594899e4b638010fa0187fe068b295 Mon Sep 17 00:00:00 2001 From: Matt Ranostay Date: Sat, 7 Nov 2015 20:21:28 -0800 Subject: iio: light: apds9960: correct ->last_busy count Add missing pm_runtime_mark_last_busy to apds9960_set_power_state function. Unless pm_runtime_mark_last_busy is called the pm_runtime_put_autosuspend may put the device into suspend before the delay time requested. Signed-off-by: Matt Ranostay Signed-off-by: Jonathan Cameron diff --git a/drivers/iio/light/apds9960.c b/drivers/iio/light/apds9960.c index 7d269ef..f6a07dc 100644 --- a/drivers/iio/light/apds9960.c +++ b/drivers/iio/light/apds9960.c @@ -453,6 +453,7 @@ static int apds9960_set_power_state(struct apds9960_data *data, bool on) usleep_range(data->als_adc_int_us, APDS9960_MAX_INT_TIME_IN_US); } else { + pm_runtime_mark_last_busy(dev); ret = pm_runtime_put_autosuspend(dev); } -- cgit v0.10.2 From 231bfe53c57e89857753c940192acba933cba56c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 21 Nov 2015 13:33:00 +0300 Subject: iio: fix some warning messages WARN_ON() only takes a condition argument. I have changed these to WARN() instead. Signed-off-by: Dan Carpenter Cc: Signed-off-by: Jonathan Cameron diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index d7e908a..0f6f63b 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -302,7 +302,7 @@ static int iio_scan_mask_set(struct iio_dev *indio_dev, if (trialmask == NULL) return -ENOMEM; if (!indio_dev->masklength) { - WARN_ON("Trying to set scanmask prior to registering buffer\n"); + WARN(1, "Trying to set scanmask prior to registering buffer\n"); goto err_invalid_mask; } bitmap_copy(trialmask, buffer->scan_mask, indio_dev->masklength); diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 208358f..159ede6 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -655,7 +655,7 @@ int __iio_device_attr_init(struct device_attribute *dev_attr, break; case IIO_SEPARATE: if (!chan->indexed) { - WARN_ON("Differential channels must be indexed\n"); + WARN(1, "Differential channels must be indexed\n"); ret = -EINVAL; goto error_free_full_postfix; } -- cgit v0.10.2 From d4c65fe4ed69a62a30a680789322ed677e3438af Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 18 Nov 2015 23:04:13 +0100 Subject: iio: adc: spmi-vadc: add missing of_node_put for_each_available_child_of_node performs an of_node_get on each iteration, so a break out of the loop requires an of_node_put. A simplified version of the semantic patch that fixes this problem is as follows (http://coccinelle.lip6.fr): // @@ expression root,e; local idexpression child; @@ for_each_available_child_of_node(root, child) { ... when != of_node_put(child) when != e = child ( return child; | + of_node_put(child); ? return ...; ) ... } // Signed-off-by: Julia Lawall Cc: Signed-off-by: Jonathan Cameron diff --git a/drivers/iio/adc/qcom-spmi-vadc.c b/drivers/iio/adc/qcom-spmi-vadc.c index 0c4618b..c2babe5 100644 --- a/drivers/iio/adc/qcom-spmi-vadc.c +++ b/drivers/iio/adc/qcom-spmi-vadc.c @@ -839,8 +839,10 @@ static int vadc_get_dt_data(struct vadc_priv *vadc, struct device_node *node) for_each_available_child_of_node(node, child) { ret = vadc_get_dt_channel_data(vadc->dev, &prop, child); - if (ret) + if (ret) { + of_node_put(child); return ret; + } vadc->chan_props[index] = prop; -- cgit v0.10.2 From ddaaf4e291dd63db0667991e4a335fcf3a7df13e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 12 Nov 2015 11:00:03 +0100 Subject: perf/core: Fix RCU problem with cgroup context switching code The RCU checker detected RCU violation in the cgroup switching routines perf_cgroup_sched_in() and perf_cgroup_sched_out(). We were dereferencing cgroup from task without holding the RCU lock. Fix this by holding the RCU read lock. We move the locking from perf_cgroup_switch() to avoid double locking. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: edumazet@google.com Link: http://lkml.kernel.org/r/1447322404-10920-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 36babfd..60e71ca 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -489,7 +489,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) * we reschedule only in the presence of cgroup * constrained events. */ - rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); @@ -531,8 +530,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) } } - rcu_read_unlock(); - local_irq_restore(flags); } @@ -542,6 +539,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, struct perf_cgroup *cgrp1; struct perf_cgroup *cgrp2 = NULL; + rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 */ @@ -561,6 +559,8 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, */ if (cgrp1 != cgrp2) perf_cgroup_switch(task, PERF_CGROUP_SWOUT); + + rcu_read_unlock(); } static inline void perf_cgroup_sched_in(struct task_struct *prev, @@ -569,6 +569,7 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, struct perf_cgroup *cgrp1; struct perf_cgroup *cgrp2 = NULL; + rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 */ @@ -584,6 +585,8 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, */ if (cgrp1 != cgrp2) perf_cgroup_switch(task, PERF_CGROUP_SWIN); + + rcu_read_unlock(); } static inline int perf_cgroup_connect(int fd, struct perf_event *event, @@ -9452,7 +9455,9 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css) static int __perf_cgroup_move(void *info) { struct task_struct *task = info; + rcu_read_lock(); perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + rcu_read_unlock(); return 0; } -- cgit v0.10.2 From 614e4c4ebc75517295bccd29b20ddbc5b52af6fc Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 12 Nov 2015 11:00:04 +0100 Subject: perf/core: Robustify the perf_cgroup_from_task() RCU checks This patch reinforces the lockdep checks performed by perf_cgroup_from_tsk() by passing the perf_event_context whenever possible. It is okay to not hold the RCU read lock when we know we hold the ctx->lock. This patch makes sure this property holds. In some functions, such as perf_cgroup_sched_in(), we do not pass the context because we are sure we are holding the RCU read lock. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: edumazet@google.com Link: http://lkml.kernel.org/r/1447322404-10920-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 377e8f8..a316ca9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target); + return perf_cgroup_from_task(event->hw.target, event->ctx); return event->cgrp; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d841d33..f9828a4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -697,9 +697,11 @@ struct perf_cgroup { * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) +perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { - return container_of(task_css(task, perf_event_cgrp_id), + return container_of(task_css_check(task, perf_event_cgrp_id, + ctx ? lockdep_is_held(&ctx->lock) + : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 60e71ca..1ac857a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -435,7 +435,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current); + cgrp = perf_cgroup_from_task(current, event->ctx); /* * Do not update time when cgroup is not active */ @@ -458,7 +458,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, if (!task || !ctx->nr_cgroups) return; - cgrp = perf_cgroup_from_task(task); + cgrp = perf_cgroup_from_task(task, ctx); info = this_cpu_ptr(cgrp->info); info->timestamp = ctx->timestamp; } @@ -521,8 +521,10 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) * set cgrp before ctxsw in to allow * event_filter_match() to not have to pass * task around + * we pass the cpuctx->ctx to perf_cgroup_from_task() + * because cgorup events are only per-cpu */ - cpuctx->cgrp = perf_cgroup_from_task(task); + cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } perf_pmu_enable(cpuctx->ctx.pmu); @@ -542,15 +544,17 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* * next is NULL when called from perf_event_enable_on_exec() * that will systematically cause a cgroup_switch() */ if (next) - cgrp2 = perf_cgroup_from_task(next); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -572,11 +576,13 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* prev can never be NULL */ - cgrp2 = perf_cgroup_from_task(prev); + cgrp2 = perf_cgroup_from_task(prev, NULL); /* * only need to schedule in cgroup events if we are changing -- cgit v0.10.2 From daecbd267daad36c1cd90254f502610f82efcb30 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 16 Nov 2015 11:07:44 +0100 Subject: perf: Update email address in MAINTAINERS While still valid, I'm trying to phase out this email address. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar diff --git a/MAINTAINERS b/MAINTAINERS index 050d0e7..a45fe35 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8284,7 +8284,7 @@ F: include/linux/delayacct.h F: kernel/delayacct.c PERFORMANCE EVENTS SUBSYSTEM -M: Peter Zijlstra +M: Peter Zijlstra M: Ingo Molnar M: Arnaldo Carvalho de Melo L: linux-kernel@vger.kernel.org -- cgit v0.10.2 From b28ae9560b693bcd2e9f4d6d9c415d5380b7c3c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 20 Oct 2015 11:46:33 -0700 Subject: perf/x86: Fix LBR call stack save/restore This fixes a bug I added in the following commit: 90405aa02247 ("perf/x86/intel/lbr: Limit LBR accesses to TOS in callstack mode") The bug could lead to lost LBR call stacks. When restoring the LBR state we need to use the TOS of the previous context, not the current context. To do that we need to save/restore the TOS. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: http://lkml.kernel.org/r/1445366797-30894-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 499f533..ffa7a92 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -627,6 +627,7 @@ struct x86_perf_task_context { u64 lbr_from[MAX_LBR_ENTRIES]; u64 lbr_to[MAX_LBR_ENTRIES]; u64 lbr_info[MAX_LBR_ENTRIES]; + int tos; int lbr_callstack_users; int lbr_stack_state; }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index bfd0b71..659f01e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -239,7 +239,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) } mask = x86_pmu.lbr_nr - 1; - tos = intel_pmu_lbr_tos(); + tos = task_ctx->tos; for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); @@ -247,6 +247,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + wrmsrl(x86_pmu.lbr_tos, tos); task_ctx->lbr_stack_state = LBR_NONE; } @@ -270,6 +271,7 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + task_ctx->tos = tos; task_ctx->lbr_stack_state = LBR_VALID; } -- cgit v0.10.2 From 90eec103b96e30401c0b846045bf8a1c7159b6da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 16 Nov 2015 11:08:45 +0100 Subject: treewide: Remove old email address There were still a number of references to my old Red Hat email address in the kernel source. Remove these while keeping the Red Hat copyright notices intact. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar diff --git a/arch/blackfin/kernel/perf_event.c b/arch/blackfin/kernel/perf_event.c index 1e9c8b0..170d786 100644 --- a/arch/blackfin/kernel/perf_event.c +++ b/arch/blackfin/kernel/perf_event.c @@ -14,7 +14,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * * ppc: diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 7cfd7f1..4dca183 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -10,7 +10,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * * ppc: diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index b0da5ae..3091267 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -9,7 +9,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/arch/tile/kernel/perf_event.c b/arch/tile/kernel/perf_event.c index bb509ce..8767060 100644 --- a/arch/tile/kernel/perf_event.c +++ b/arch/tile/kernel/perf_event.c @@ -21,7 +21,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4562cf0..2bf79d7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian * diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ffa7a92..ab18b8a 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian * diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index dc5fa6a..3512ba6 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -1,7 +1,7 @@ /* * x86 specific code for irq_work * - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index db284bf..9dbb739 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -5,7 +5,7 @@ * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. * - * Copyright 2011 Red Hat, Inc., Peter Zijlstra + * Copyright 2011 Red Hat, Inc., Peter Zijlstra * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 8dde559..0536524 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -5,7 +5,7 @@ * Jump label support * * Copyright (C) 2009-2012 Jason Baron - * Copyright (C) 2011-2012 Peter Zijlstra + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra * * DEPRECATED API: * diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 70400dc..c57e424 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -2,7 +2,7 @@ * Runtime locking correctness validator * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * see Documentation/locking/lockdep-design.txt for more details. */ diff --git a/include/linux/proportions.h b/include/linux/proportions.h index 5440f64..2122133 100644 --- a/include/linux/proportions.h +++ b/include/linux/proportions.h @@ -1,7 +1,7 @@ /* * FLoating proportions * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * This file contains the public data structure and API definitions. */ diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 0bdc72f..4a29c75 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -21,7 +21,7 @@ * Authors: * Srikar Dronamraju * Jim Keniston - * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index d659487..9c41800 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/core.c b/kernel/events/core.c index 1ac857a..5854fcf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index b5d1ea7..adfdc05 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4e5e979..7dad849 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ * Authors: * Srikar Dronamraju * Jim Keniston - * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/kernel/irq_work.c b/kernel/irq_work.c index cbf9fb8..bcf107c 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * * Provides a framework for enqueueing and running callbacks from hardirq * context. The enqueueing is NMI-safe. diff --git a/kernel/jump_label.c b/kernel/jump_label.c index f7dd15d..05254ee 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -2,7 +2,7 @@ * jump label support * * Copyright (C) 2009 Jason Baron - * Copyright (C) 2011 Peter Zijlstra + * Copyright (C) 2011 Peter Zijlstra * */ #include diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index deae390..60ace56 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * this code maps all the lock dependencies as they occur in a live kernel * and will warn about the following classes of locking bugs: diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index d83d798..dbb61a3 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Code for /proc/lockdep and /proc/lockdep_stats: * diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c0a2051..caf4041 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,7 +1,7 @@ /* * sched_clock for unstable cpu clocks * - * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * * Updates and enhancements: * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f04fda8..90e26b1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -17,7 +17,7 @@ * Copyright (C) 2007, Thomas Gleixner * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index abfc903..cc9f7a9 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -1,7 +1,7 @@ /* * trace event based perf event profiling/tracing * - * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra * Copyright (C) 2009-2010 Frederic Weisbecker */ diff --git a/lib/btree.c b/lib/btree.c index 4264871..f93a945 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -5,7 +5,7 @@ * * Copyright (c) 2007-2008 Joern Engel * Bits and pieces stolen from Peter Zijlstra's code, which is - * Copyright 2007, Red Hat Inc. Peter Zijlstra + * Copyright 2007, Red Hat Inc. Peter Zijlstra * GPLv2 * * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch diff --git a/lib/proportions.c b/lib/proportions.c index 6f72429..efa54f25 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -1,7 +1,7 @@ /* * Floating proportions * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Description: * diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e4d654..d15d88c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2,7 +2,7 @@ * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. -- cgit v0.10.2 From 7c90e610b60cd1ed6abafd806acfaedccbbe52d1 Mon Sep 17 00:00:00 2001 From: Konstantin Shkolnyy Date: Tue, 10 Nov 2015 16:40:13 -0600 Subject: USB: cp210x: Remove CP2110 ID from compatibility list CP2110 ID (0x10c4, 0xea80) doesn't belong here because it's a HID and completely different from CP210x devices. Signed-off-by: Konstantin Shkolnyy Cc: stable Signed-off-by: Johan Hovold diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index eac7cca..7d4f51a 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -132,7 +132,6 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x10C4, 0xEA60) }, /* Silicon Labs factory default */ { USB_DEVICE(0x10C4, 0xEA61) }, /* Silicon Labs factory default */ { USB_DEVICE(0x10C4, 0xEA70) }, /* Silicon Labs factory default */ - { USB_DEVICE(0x10C4, 0xEA80) }, /* Silicon Labs factory default */ { USB_DEVICE(0x10C4, 0xEA71) }, /* Infinity GPS-MIC-1 Radio Monophone */ { USB_DEVICE(0x10C4, 0xF001) }, /* Elan Digital Systems USBscope50 */ { USB_DEVICE(0x10C4, 0xF002) }, /* Elan Digital Systems USBwave12 */ -- cgit v0.10.2 From f33a7f72e5fc033daccbb8d4753d7c5c41a4d67b Mon Sep 17 00:00:00 2001 From: Jonas Jonsson Date: Sun, 22 Nov 2015 11:47:17 +0100 Subject: USB: cdc_acm: Ignore Infineon Flash Loader utility Some modems, such as the Telit UE910, are using an Infineon Flash Loader utility. It has two interfaces, 2/2/0 (Abstract Modem) and 10/0/0 (CDC Data). The latter can be used as a serial interface to upgrade the firmware of the modem. However, that isn't possible when the cdc-acm driver takes control of the device. The following is an explanation of the behaviour by Daniele Palmas during discussion on linux-usb. "This is what happens when the device is turned on (without modifying the drivers): [155492.352031] usb 1-3: new high-speed USB device number 27 using ehci-pci [155492.485429] usb 1-3: config 1 interface 0 altsetting 0 endpoint 0x81 has an invalid bInterval 255, changing to 11 [155492.485436] usb 1-3: New USB device found, idVendor=058b, idProduct=0041 [155492.485439] usb 1-3: New USB device strings: Mfr=0, Product=0, SerialNumber=0 [155492.485952] cdc_acm 1-3:1.0: ttyACM0: USB ACM device This is the flashing device that is caught by the cdc-acm driver. Once the ttyACM appears, the application starts sending a magic string (simple write on the file descriptor) to keep the device in flashing mode. If this magic string is not properly received in a certain time interval, the modem goes on in normal operative mode: [155493.748094] usb 1-3: USB disconnect, device number 27 [155494.916025] usb 1-3: new high-speed USB device number 28 using ehci-pci [155495.059978] usb 1-3: New USB device found, idVendor=1bc7, idProduct=0021 [155495.059983] usb 1-3: New USB device strings: Mfr=1, Product=2, SerialNumber=3 [155495.059986] usb 1-3: Product: 6 CDC-ACM + 1 CDC-ECM [155495.059989] usb 1-3: Manufacturer: Telit [155495.059992] usb 1-3: SerialNumber: 359658044004697 [155495.138958] cdc_acm 1-3:1.0: ttyACM0: USB ACM device [155495.140832] cdc_acm 1-3:1.2: ttyACM1: USB ACM device [155495.142827] cdc_acm 1-3:1.4: ttyACM2: USB ACM device [155495.144462] cdc_acm 1-3:1.6: ttyACM3: USB ACM device [155495.145967] cdc_acm 1-3:1.8: ttyACM4: USB ACM device [155495.147588] cdc_acm 1-3:1.10: ttyACM5: USB ACM device [155495.154322] cdc_ether 1-3:1.12 wwan0: register 'cdc_ether' at usb-0000:00:1a.7-3, Mobile Broadband Network Device, 00:00:11:12:13:14 Using the cdc-acm driver, the string, though being sent in the same way than using the usb-serial-simple driver (I can confirm that the data is passing properly since I used an hw usb sniffer), does not make the device to stay in flashing mode." Signed-off-by: Jonas Jonsson Tested-by: Daniele Palmas Cc: stable Signed-off-by: Johan Hovold diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index b30e742..26ca4f9 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1838,6 +1838,11 @@ static const struct usb_device_id acm_ids[] = { }, #endif + /* Exclude Infineon Flash Loader utility */ + { USB_DEVICE(0x058b, 0x0041), + .driver_info = IGNORE_DEVICE, + }, + /* control interfaces without any protocol set */ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM, USB_CDC_PROTO_NONE) }, -- cgit v0.10.2 From a0e80fbd56b4573de997c9a088a33abbc1121400 Mon Sep 17 00:00:00 2001 From: Jonas Jonsson Date: Sun, 22 Nov 2015 11:47:18 +0100 Subject: USB: serial: Another Infineon flash loader USB ID The flash loader has been seen on a Telit UE910 modem. The flash loader is a bit special, it presents both an ACM and CDC Data interface but only the latter is useful. Unless a magic string is sent to the device it will disappear and the regular modem device appears instead. Signed-off-by: Jonas Jonsson Tested-by: Daniele Palmas Cc: stable Signed-off-by: Johan Hovold diff --git a/drivers/usb/serial/usb-serial-simple.c b/drivers/usb/serial/usb-serial-simple.c index 3658662..a204782 100644 --- a/drivers/usb/serial/usb-serial-simple.c +++ b/drivers/usb/serial/usb-serial-simple.c @@ -53,6 +53,7 @@ DEVICE(funsoft, FUNSOFT_IDS); /* Infineon Flashloader driver */ #define FLASHLOADER_IDS() \ + { USB_DEVICE_INTERFACE_CLASS(0x058b, 0x0041, USB_CLASS_CDC_DATA) }, \ { USB_DEVICE(0x8087, 0x0716) } DEVICE(flashloader, FLASHLOADER_IDS); -- cgit v0.10.2 From df976f5d95028e26b4c967b24518fe340f8a726d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 23:09:58 -0500 Subject: clk: ti: clkt_dpll: fix wrong do_div() usage do_div() is meant to be used with an unsigned dividend. Signed-off-by: Nicolas Pitre Signed-off-by: Tero Kristo diff --git a/drivers/clk/ti/clkt_dpll.c b/drivers/clk/ti/clkt_dpll.c index 9023ca9..b5cc6f6 100644 --- a/drivers/clk/ti/clkt_dpll.c +++ b/drivers/clk/ti/clkt_dpll.c @@ -240,7 +240,7 @@ u8 omap2_init_dpll_parent(struct clk_hw *hw) */ unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk) { - long long dpll_clk; + u64 dpll_clk; u32 dpll_mult, dpll_div, v; struct dpll_data *dd; @@ -262,7 +262,7 @@ unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk) dpll_div = v & dd->div1_mask; dpll_div >>= __ffs(dd->div1_mask); - dpll_clk = (long long)clk_get_rate(dd->clk_ref) * dpll_mult; + dpll_clk = (u64)clk_get_rate(dd->clk_ref) * dpll_mult; do_div(dpll_clk, dpll_div + 1); return dpll_clk; -- cgit v0.10.2 From c51185b45c43737faca4574d790489a1bd8cfd11 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 23:17:11 -0500 Subject: clk: ti: fapll: fix wrong do_div() usage do_div() is meant to be used with an unsigned dividend. Signed-off-by: Nicolas Pitre Signed-off-by: Tero Kristo diff --git a/drivers/clk/ti/fapll.c b/drivers/clk/ti/fapll.c index f4b2e98..66a0d0e 100644 --- a/drivers/clk/ti/fapll.c +++ b/drivers/clk/ti/fapll.c @@ -168,7 +168,7 @@ static unsigned long ti_fapll_recalc_rate(struct clk_hw *hw, { struct fapll_data *fd = to_fapll(hw); u32 fapll_n, fapll_p, v; - long long rate; + u64 rate; if (ti_fapll_clock_is_bypass(fd)) return parent_rate; @@ -314,7 +314,7 @@ static unsigned long ti_fapll_synth_recalc_rate(struct clk_hw *hw, { struct fapll_synth *synth = to_synth(hw); u32 synth_div_m; - long long rate; + u64 rate; /* The audio_pll_clk1 is hardwired to produce 32.768KiHz clock */ if (!synth->div) -- cgit v0.10.2 From 3a5b1dc4a3237870cfb53d31bc4cbc2a9c7aa16e Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Fri, 13 Nov 2015 17:29:58 +0100 Subject: clk: ti816x: Add missing dmtimer clkdev entries Add missing clkdev dmtimer related entries for dm816x. 32Khz and ext sources were missing. Signed-off-by: Neil Armstrong Cc: Brian Hutchinson Acked-by: Tony Lindgren Signed-off-by: Tero Kristo diff --git a/drivers/clk/ti/clk-816x.c b/drivers/clk/ti/clk-816x.c index 1dfad0c..2a5d84f 100644 --- a/drivers/clk/ti/clk-816x.c +++ b/drivers/clk/ti/clk-816x.c @@ -20,6 +20,8 @@ static struct ti_dt_clk dm816x_clks[] = { DT_CLK(NULL, "sys_clkin", "sys_clkin_ck"), DT_CLK(NULL, "timer_sys_ck", "sys_clkin_ck"), DT_CLK(NULL, "sys_32k_ck", "sys_32k_ck"), + DT_CLK(NULL, "timer_32k_ck", "sysclk18_ck"), + DT_CLK(NULL, "timer_ext_ck", "tclkin_ck"), DT_CLK(NULL, "mpu_ck", "mpu_ck"), DT_CLK(NULL, "timer1_fck", "timer1_fck"), DT_CLK(NULL, "timer2_fck", "timer2_fck"), -- cgit v0.10.2 From 167af5ef2cdba14ff14a13c91e5532ed479083d8 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 1 Oct 2015 14:20:37 -0500 Subject: clk: ti: drop locking code from mux/divider drivers TI's mux and divider clock drivers do not require locking and they do not initialize internal spinlocks. This code was occasionally copy-posted from generic mux/divider drivers. So remove it. Signed-off-by: Grygorii Strashko Cc: Tony Lindgren Cc: Sekhar Nori Signed-off-by: Tero Kristo diff --git a/drivers/clk/ti/divider.c b/drivers/clk/ti/divider.c index 5b17268..df25583 100644 --- a/drivers/clk/ti/divider.c +++ b/drivers/clk/ti/divider.c @@ -214,7 +214,6 @@ static int ti_clk_divider_set_rate(struct clk_hw *hw, unsigned long rate, { struct clk_divider *divider; unsigned int div, value; - unsigned long flags = 0; u32 val; if (!hw || !rate) @@ -228,9 +227,6 @@ static int ti_clk_divider_set_rate(struct clk_hw *hw, unsigned long rate, if (value > div_mask(divider)) value = div_mask(divider); - if (divider->lock) - spin_lock_irqsave(divider->lock, flags); - if (divider->flags & CLK_DIVIDER_HIWORD_MASK) { val = div_mask(divider) << (divider->shift + 16); } else { @@ -240,9 +236,6 @@ static int ti_clk_divider_set_rate(struct clk_hw *hw, unsigned long rate, val |= value << divider->shift; ti_clk_ll_ops->clk_writel(val, divider->reg); - if (divider->lock) - spin_unlock_irqrestore(divider->lock, flags); - return 0; } @@ -256,8 +249,7 @@ static struct clk *_register_divider(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, - const struct clk_div_table *table, - spinlock_t *lock) + const struct clk_div_table *table) { struct clk_divider *div; struct clk *clk; @@ -288,7 +280,6 @@ static struct clk *_register_divider(struct device *dev, const char *name, div->shift = shift; div->width = width; div->flags = clk_divider_flags; - div->lock = lock; div->hw.init = &init; div->table = table; @@ -421,7 +412,7 @@ struct clk *ti_clk_register_divider(struct ti_clk *setup) clk = _register_divider(NULL, setup->name, div->parent, flags, (void __iomem *)reg, div->bit_shift, - width, div_flags, table, NULL); + width, div_flags, table); if (IS_ERR(clk)) kfree(table); @@ -584,8 +575,7 @@ static void __init of_ti_divider_clk_setup(struct device_node *node) goto cleanup; clk = _register_divider(NULL, node->name, parent_name, flags, reg, - shift, width, clk_divider_flags, table, - NULL); + shift, width, clk_divider_flags, table); if (!IS_ERR(clk)) { of_clk_add_provider(node, of_clk_src_simple_get, clk); diff --git a/drivers/clk/ti/mux.c b/drivers/clk/ti/mux.c index 69f08a1..dab9ba8 100644 --- a/drivers/clk/ti/mux.c +++ b/drivers/clk/ti/mux.c @@ -69,7 +69,6 @@ static int ti_clk_mux_set_parent(struct clk_hw *hw, u8 index) { struct clk_mux *mux = to_clk_mux(hw); u32 val; - unsigned long flags = 0; if (mux->table) { index = mux->table[index]; @@ -81,9 +80,6 @@ static int ti_clk_mux_set_parent(struct clk_hw *hw, u8 index) index++; } - if (mux->lock) - spin_lock_irqsave(mux->lock, flags); - if (mux->flags & CLK_MUX_HIWORD_MASK) { val = mux->mask << (mux->shift + 16); } else { @@ -93,9 +89,6 @@ static int ti_clk_mux_set_parent(struct clk_hw *hw, u8 index) val |= index << mux->shift; ti_clk_ll_ops->clk_writel(val, mux->reg); - if (mux->lock) - spin_unlock_irqrestore(mux->lock, flags); - return 0; } @@ -109,7 +102,7 @@ static struct clk *_register_mux(struct device *dev, const char *name, const char **parent_names, u8 num_parents, unsigned long flags, void __iomem *reg, u8 shift, u32 mask, u8 clk_mux_flags, - u32 *table, spinlock_t *lock) + u32 *table) { struct clk_mux *mux; struct clk *clk; @@ -133,7 +126,6 @@ static struct clk *_register_mux(struct device *dev, const char *name, mux->shift = shift; mux->mask = mask; mux->flags = clk_mux_flags; - mux->lock = lock; mux->table = table; mux->hw.init = &init; @@ -175,7 +167,7 @@ struct clk *ti_clk_register_mux(struct ti_clk *setup) return _register_mux(NULL, setup->name, mux->parents, mux->num_parents, flags, (void __iomem *)reg, mux->bit_shift, mask, - mux_flags, NULL, NULL); + mux_flags, NULL); } /** @@ -227,8 +219,7 @@ static void of_mux_clk_setup(struct device_node *node) mask = (1 << fls(mask)) - 1; clk = _register_mux(NULL, node->name, parent_names, num_parents, - flags, reg, shift, mask, clk_mux_flags, NULL, - NULL); + flags, reg, shift, mask, clk_mux_flags, NULL); if (!IS_ERR(clk)) of_clk_add_provider(node, of_clk_src_simple_get, clk); -- cgit v0.10.2 From a4dad1ae24f850410c4e60f22823cba1289b8d52 Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 24 Nov 2015 14:34:37 -0500 Subject: ext4: Fix handling of extended tv_sec In ext4, the bottom two bits of {a,c,m}time_extra are used to extend the {a,c,m}time fields, deferring the year 2038 problem to the year 2446. When decoding these extended fields, for times whose bottom 32 bits would represent a negative number, sign extension causes the 64-bit extended timestamp to be negative as well, which is not what's intended. This patch corrects that issue, so that the only negative {a,c,m}times are those between 1901 and 1970 (as per 32-bit signed timestamps). Some older kernels might have written pre-1970 dates with 1,1 in the extra bits. This patch treats those incorrectly-encoded dates as pre-1970, instead of post-2311, until kernel 4.20 is released. Hopefully by then e2fsck will have fixed up the bad data. Also add a comment explaining the encoding of ext4's extra {a,c,m}time bits. Signed-off-by: David Turner Signed-off-by: Theodore Ts'o Reported-by: Mark Harris Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=23732 Cc: stable@vger.kernel.org diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 750063f..cc7ca4e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -727,19 +728,55 @@ struct move_extent { <= (EXT4_GOOD_OLD_INODE_SIZE + \ (einode)->i_extra_isize)) \ +/* + * We use an encoding that preserves the times for extra epoch "00": + * + * extra msb of adjust for signed + * epoch 32-bit 32-bit tv_sec to + * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range + * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 + * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 + * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 + * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 + * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 + * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 + * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 + * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 + * + * Note that previous versions of the kernel on 64-bit systems would + * incorrectly use extra epoch bits 1,1 for dates between 1901 and + * 1970. e2fsck will correct this, assuming that it is run on the + * affected filesystem before 2242. + */ + static inline __le32 ext4_encode_extra_time(struct timespec *time) { - return cpu_to_le32((sizeof(time->tv_sec) > 4 ? - (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | - ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); + u32 extra = sizeof(time->tv_sec) > 4 ? + ((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0; + return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); } static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) { - if (sizeof(time->tv_sec) > 4) - time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) - << 32; - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + if (unlikely(sizeof(time->tv_sec) > 4 && + (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0) + /* Handle legacy encoding of pre-1970 dates with epoch + * bits 1,1. We assume that by kernel version 4.20, + * everyone will have run fsck over the affected + * filesystems to correct the problem. (This + * backwards compatibility may be removed before this + * time, at the discretion of the ext4 developers.) + */ + u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; + if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) + extra_bits = 0; + time->tv_sec += extra_bits << 32; +#else + time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; +#endif + } + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; } #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ -- cgit v0.10.2 From bc23f0c8d7ccd8d924c4e70ce311288cb3e61ea8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 Nov 2015 15:34:35 -0500 Subject: jbd2: Fix unreclaimed pages after truncate in data=journal mode Ted and Namjae have reported that truncated pages don't get timely reclaimed after being truncated in data=journal mode. The following test triggers the issue easily: for (i = 0; i < 1000; i++) { pwrite(fd, buf, 1024*1024, 0); fsync(fd); fsync(fd); ftruncate(fd, 0); } The reason is that journal_unmap_buffer() finds that truncated buffers are not journalled (jh->b_transaction == NULL), they are part of checkpoint list of a transaction (jh->b_cp_transaction != NULL) and have been already written out (!buffer_dirty(bh)). We clean such buffers but we leave them in the checkpoint list. Since checkpoint transaction holds a reference to the journal head, these buffers cannot be released until the checkpoint transaction is cleaned up. And at that point we don't call release_buffer_page() anymore so pages detached from mapping are lingering in the system waiting for reclaim to find them and free them. Fix the problem by removing buffers from transaction checkpoint lists when journal_unmap_buffer() finds out they don't have to be there anymore. Reported-and-tested-by: Namjae Jeon Fixes: de1b794130b130e77ffa975bb58cb843744f9ae5 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6b8338e..b996212 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2152,6 +2152,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, if (!buffer_dirty(bh)) { /* bdflush has written it. We can drop it now */ + __jbd2_journal_remove_checkpoint(jh); goto zap_buffer; } @@ -2181,6 +2182,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, /* The orphan record's transaction has * committed. We can cleanse this buffer */ clear_buffer_jbddirty(bh); + __jbd2_journal_remove_checkpoint(jh); goto zap_buffer; } } -- cgit v0.10.2 From 07687c031d14a1f36613231c0ea13ce3c4025615 Mon Sep 17 00:00:00 2001 From: Alan Tull Date: Thu, 29 Oct 2015 14:39:56 -0500 Subject: fpga manager: remove label Remove implementation of 'label' DT binding. Signed-off-by: Alan Tull Reviewed-by: Moritz Fischer Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/fpga/fpga-mgr.c b/drivers/fpga/fpga-mgr.c index a24f5cb..bfbf8aa 100644 --- a/drivers/fpga/fpga-mgr.c +++ b/drivers/fpga/fpga-mgr.c @@ -256,7 +256,6 @@ int fpga_mgr_register(struct device *dev, const char *name, void *priv) { struct fpga_manager *mgr; - const char *dt_label; int id, ret; if (!mops || !mops->write_init || !mops->write || @@ -300,11 +299,9 @@ int fpga_mgr_register(struct device *dev, const char *name, mgr->dev.id = id; dev_set_drvdata(dev, mgr); - dt_label = of_get_property(mgr->dev.of_node, "label", NULL); - if (dt_label) - ret = dev_set_name(&mgr->dev, "%s", dt_label); - else - ret = dev_set_name(&mgr->dev, "fpga%d", id); + ret = dev_set_name(&mgr->dev, "fpga%d", id); + if (ret) + goto error_device; ret = device_add(&mgr->dev); if (ret) -- cgit v0.10.2 From e8c77bda05e5d93cce6f38cfdde4192307951dea Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 18 Nov 2015 10:48:16 +0100 Subject: fpga manager: Fix firmware resource leak on error If fpga_mgr_buf_load() fails, the firmware resource previously allocated by request_firmware() is leaked. Fix it by calling release_firmware() regardless of the return value of fpga_mgr_buf_load(). Found by the Coverity scanner (CID 1339653). Fixes: 6a8c3be7ec8e ("add FPGA manager core") Signed-off-by: Tobias Klauser Acked-by: Alan Tull Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/fpga/fpga-mgr.c b/drivers/fpga/fpga-mgr.c index bfbf8aa..953dc91 100644 --- a/drivers/fpga/fpga-mgr.c +++ b/drivers/fpga/fpga-mgr.c @@ -122,12 +122,10 @@ int fpga_mgr_firmware_load(struct fpga_manager *mgr, u32 flags, } ret = fpga_mgr_buf_load(mgr, flags, fw->data, fw->size); - if (ret) - return ret; release_firmware(fw); - return 0; + return ret; } EXPORT_SYMBOL_GPL(fpga_mgr_firmware_load); -- cgit v0.10.2 From c21ac06648a7a9985fa81a53ba3a65569a1de388 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 25 Nov 2015 23:03:49 +0100 Subject: clk: mmp: add linux/clk.h includes The common clk implementation for MMP broke without anyone noticing when we stopped including linux/clk.h from the clk-provider header. This did not show up in the defconfig builds because those use the legacy MMP clk drivers, and it did not show up in my randconfig tests either because I was testing with my mmp multiplatform series applied, which at some point gained the fixup. This fixes the three broken files. Signed-off-by: Arnd Bergmann Fixes: 61ae76563ec3 ("clk: Remove clk.h from clk-provider.h") Signed-off-by: Stephen Boyd diff --git a/drivers/clk/mmp/clk-mmp2.c b/drivers/clk/mmp/clk-mmp2.c index 09d2832..71fd293 100644 --- a/drivers/clk/mmp/clk-mmp2.c +++ b/drivers/clk/mmp/clk-mmp2.c @@ -9,6 +9,7 @@ * warranty of any kind, whether express or implied. */ +#include #include #include #include diff --git a/drivers/clk/mmp/clk-pxa168.c b/drivers/clk/mmp/clk-pxa168.c index 93e967c..7524491 100644 --- a/drivers/clk/mmp/clk-pxa168.c +++ b/drivers/clk/mmp/clk-pxa168.c @@ -9,6 +9,7 @@ * warranty of any kind, whether express or implied. */ +#include #include #include #include diff --git a/drivers/clk/mmp/clk-pxa910.c b/drivers/clk/mmp/clk-pxa910.c index 993abcd..37ba04b 100644 --- a/drivers/clk/mmp/clk-pxa910.c +++ b/drivers/clk/mmp/clk-pxa910.c @@ -9,6 +9,7 @@ * warranty of any kind, whether express or implied. */ +#include #include #include #include -- cgit v0.10.2 From 025af189fb44250206dd8a32fa4a682392af3301 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Fri, 20 Nov 2015 11:43:50 -0800 Subject: drm/ttm: Fixed a read/write lock imbalance In ttm_write_lock(), the uninterruptible path should call __ttm_write_lock() not __ttm_read_lock(). This fixes a vmwgfx hang on F23 start up. syeh: Extracted this from one of Thomas' internal patches. Cc: Signed-off-by: Thomas Hellstrom Reviewed-by: Sinclair Yeh diff --git a/drivers/gpu/drm/ttm/ttm_lock.c b/drivers/gpu/drm/ttm/ttm_lock.c index 6a95454..f154fb1 100644 --- a/drivers/gpu/drm/ttm/ttm_lock.c +++ b/drivers/gpu/drm/ttm/ttm_lock.c @@ -180,7 +180,7 @@ int ttm_write_lock(struct ttm_lock *lock, bool interruptible) spin_unlock(&lock->lock); } } else - wait_event(lock->queue, __ttm_read_lock(lock)); + wait_event(lock->queue, __ttm_write_lock(lock)); return ret; } -- cgit v0.10.2 From 99f9be4c73cff20ec64793be68f03f927474298c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 21 Nov 2015 13:29:39 +0300 Subject: drm/vmwgfx: fix a warning message The WARN_ON() macro only takes a condition argument, it doesn't take a message. I have converted this to WARN() instead. Signed-off-by: Dan Carpenter Reviewed-by: Sinclair Yeh diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c index a8baf5f..b6a0806 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c @@ -390,7 +390,7 @@ void *vmw_fifo_reserve_dx(struct vmw_private *dev_priv, uint32_t bytes, else if (ctx_id == SVGA3D_INVALID_ID) ret = vmw_local_fifo_reserve(dev_priv, bytes); else { - WARN_ON("Command buffer has not been allocated.\n"); + WARN(1, "Command buffer has not been allocated.\n"); ret = NULL; } if (IS_ERR_OR_NULL(ret)) { -- cgit v0.10.2 From e2c9e0b28e146c9a3bce21408f3c02e24ac7ac31 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 26 Nov 2015 15:20:19 -0500 Subject: ext4: fix an endianness bug in ext4_encrypted_zeroout() ex->ee_block is not host-endian (note that accesses of other fields of *ex right next to that line go through the helpers that do proper conversion from little-endian to host-endian; it might make sense to add similar for ->ee_block to avoid reintroducing that kind of bugs...) Cc: stable@vger.kernel.org # v4.1+ Signed-off-by: Al Viro Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index af06830..1a08350 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -389,7 +389,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) struct ext4_crypto_ctx *ctx; struct page *ciphertext_page = NULL; struct bio *bio; - ext4_lblk_t lblk = ex->ee_block; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); ext4_fsblk_t pblk = ext4_ext_pblock(ex); unsigned int len = ext4_ext_get_actual_len(ex); int ret, err = 0; -- cgit v0.10.2 From 5a1c7f47da9b32d0671e776b0f388095b7f91e2e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 26 Nov 2015 15:20:50 -0500 Subject: ext4: fix an endianness bug in ext4_encrypted_follow_link() applying le32_to_cpu() to 16bit value is a bad idea... Cc: stable@vger.kernel.org # v4.1+ Signed-off-by: Al Viro Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index abe2401..e8e7af6 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -52,7 +52,7 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook /* Symlink is encrypted */ sd = (struct ext4_encrypted_symlink_data *)caddr; cstr.name = sd->encrypted_path; - cstr.len = le32_to_cpu(sd->len); + cstr.len = le16_to_cpu(sd->len); if ((cstr.len + sizeof(struct ext4_encrypted_symlink_data) - 1) > max_size) { -- cgit v0.10.2 From 681c46b164d79aaa5d18f7519b39e29f9b441e40 Mon Sep 17 00:00:00 2001 From: Xu Cang Date: Thu, 26 Nov 2015 15:52:24 -0500 Subject: ext4: add "static" to ext4_seq_##name##_fops struct to fix sparse warning, add static to ext4_seq_##name##_fops struct. Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 1b57c72..1420a3c 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -358,7 +358,7 @@ static int name##_open(struct inode *inode, struct file *file) \ return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \ } \ \ -const struct file_operations ext4_seq_##name##_fops = { \ +static const struct file_operations ext4_seq_##name##_fops = { \ .owner = THIS_MODULE, \ .open = name##_open, \ .read = seq_read, \ -- cgit v0.10.2 From 62f49ee26f14753bd7fa634f51d537a79061bf0f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Nov 2015 22:26:43 +0100 Subject: cpufreq: s3c24xx: Do not mark s3c2410_plls_add as __init s3c2410_plls_add is a device notifier that may be called at runtime and is correctly not marked __init. However it calls s3c_plltab_register() which is marked __init, and that triggers a build error when we are checking for section mismatches: WARNING: vmlinux.o(.text+0x195e0): Section mismatch in reference from the function s3c2410_plls_add() to the function .init.text:s3c_plltab_register() The function s3c2410_plls_add() references the function __init s3c_plltab_register(). This is often because s3c2410_plls_add lacks a __init annotation or the annotation of s3c_plltab_register is wrong. This removes the __init annotation from s3c2410_plls_add as well as the __initdata section annotations from s3c2440_plls_12 and s3c2440_plls_169344, which in turn are referenced from s3c2410_plls_add. Signed-off-by: Arnd Bergmann Signed-off-by: Krzysztof Kozlowski diff --git a/arch/arm/mach-s3c24xx/pll-s3c2440-12000000.c b/arch/arm/mach-s3c24xx/pll-s3c2440-12000000.c index a19460e..b355fca 100644 --- a/arch/arm/mach-s3c24xx/pll-s3c2440-12000000.c +++ b/arch/arm/mach-s3c24xx/pll-s3c2440-12000000.c @@ -20,7 +20,7 @@ #include #include -static struct cpufreq_frequency_table s3c2440_plls_12[] __initdata = { +static struct cpufreq_frequency_table s3c2440_plls_12[] = { { .frequency = 75000000, .driver_data = PLLVAL(0x75, 3, 3), }, /* FVco 600.000000 */ { .frequency = 80000000, .driver_data = PLLVAL(0x98, 4, 3), }, /* FVco 640.000000 */ { .frequency = 90000000, .driver_data = PLLVAL(0x70, 2, 3), }, /* FVco 720.000000 */ diff --git a/arch/arm/mach-s3c24xx/pll-s3c2440-16934400.c b/arch/arm/mach-s3c24xx/pll-s3c2440-16934400.c index 1191b29..be9a248 100644 --- a/arch/arm/mach-s3c24xx/pll-s3c2440-16934400.c +++ b/arch/arm/mach-s3c24xx/pll-s3c2440-16934400.c @@ -20,7 +20,7 @@ #include #include -static struct cpufreq_frequency_table s3c2440_plls_169344[] __initdata = { +static struct cpufreq_frequency_table s3c2440_plls_169344[] = { { .frequency = 78019200, .driver_data = PLLVAL(121, 5, 3), }, /* FVco 624.153600 */ { .frequency = 84067200, .driver_data = PLLVAL(131, 5, 3), }, /* FVco 672.537600 */ { .frequency = 90115200, .driver_data = PLLVAL(141, 5, 3), }, /* FVco 720.921600 */ diff --git a/drivers/cpufreq/s3c24xx-cpufreq.c b/drivers/cpufreq/s3c24xx-cpufreq.c index 733aa51..68ef8fd 100644 --- a/drivers/cpufreq/s3c24xx-cpufreq.c +++ b/drivers/cpufreq/s3c24xx-cpufreq.c @@ -648,7 +648,7 @@ late_initcall(s3c_cpufreq_initcall); * * Register the given set of PLLs with the system. */ -int __init s3c_plltab_register(struct cpufreq_frequency_table *plls, +int s3c_plltab_register(struct cpufreq_frequency_table *plls, unsigned int plls_no) { struct cpufreq_frequency_table *vals; -- cgit v0.10.2 From 53254f900bd9ff1e3cc5628e76126bb403d9d160 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Nov 2015 14:55:41 -0500 Subject: cgroup: make css_set pin its css's to avoid use-afer-free A css_set represents the relationship between a set of tasks and css's. css_set never pinned the associated css's. This was okay because tasks used to always disassociate immediately (in RCU sense) - either a task is moved to a different css_set or exits and never accesses css_set again. Unfortunately, afcf6c8b7544 ("cgroup: add cgroup_subsys->free() method and use it to fix pids controller") and patches leading up to it made a zombie hold onto its css_set and deref the associated css's on its release. Nothing pins the css's after exit and it might have already been freed leading to use-after-free. general protection fault: 0000 [#1] PREEMPT SMP task: ffffffff81bf2500 ti: ffffffff81be4000 task.ti: ffffffff81be4000 RIP: 0010:[] [] pids_cancel.constprop.4+0x5/0x40 ... Call Trace: [] ? pids_free+0x3d/0xa0 [] cgroup_free+0x53/0xe0 [] __put_task_struct+0x42/0x130 [] delayed_put_task_struct+0x77/0x130 [] rcu_process_callbacks+0x2f4/0x820 [] ? rcu_process_callbacks+0x2b3/0x820 [] __do_softirq+0xd4/0x460 [] irq_exit+0x89/0xa0 [] smp_apic_timer_interrupt+0x42/0x50 [] apic_timer_interrupt+0x84/0x90 ... Code: 5b 5d c3 48 89 df 48 c7 c2 c9 f9 ae 81 48 c7 c6 91 2c ae 81 e8 1d 94 0e 00 31 c0 5b 5d c3 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 83 87 e0 00 00 00 ff 78 01 c3 80 3d 08 7a c1 00 00 74 02 RIP [] pids_cancel.constprop.4+0x5/0x40 RSP ---[ end trace 89a4a4b916b90c49 ]--- Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: disabled ---[ end Kernel panic - not syncing: Fatal exception in interrupt Fix it by making css_set pin the associate css's until its release. Signed-off-by: Tejun Heo Reported-by: Dave Jones Reported-by: Daniel Wagner Link: http://lkml.kernel.org/g/20151120041836.GA18390@codemonkey.org.uk Link: http://lkml.kernel.org/g/5652D448.3080002@bmw-carit.de Fixes: afcf6c8b7544 ("cgroup: add cgroup_subsys->free() method and use it to fix pids controller") diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b316deb..5cea63f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -760,9 +760,11 @@ static void put_css_set_locked(struct css_set *cset) if (!atomic_dec_and_test(&cset->refcount)) return; - /* This css_set is dead. unlink it and release cgroup refcounts */ - for_each_subsys(ss, ssid) + /* This css_set is dead. unlink it and release cgroup and css refs */ + for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); + css_put(cset->subsys[ssid]); + } hash_del(&cset->hlist); css_set_count--; @@ -1062,9 +1064,13 @@ static struct css_set *find_css_set(struct css_set *old_cset, key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); - for_each_subsys(ss, ssid) + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cset->subsys[ssid]; + list_add_tail(&cset->e_cset_node[ssid], - &cset->subsys[ssid]->cgroup->e_csets[ssid]); + &css->cgroup->e_csets[ssid]); + css_get(css); + } spin_unlock_bh(&css_set_lock); -- cgit v0.10.2 From c9e75f0492b248aeaa7af8991a6fc9a21506bc96 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 27 Nov 2015 19:57:19 +0100 Subject: cgroup: pids: fix race between cgroup_post_fork() and cgroup_migrate() If the new child migrates to another cgroup before cgroup_post_fork() calls subsys->fork(), then both pids_can_attach() and pids_fork() will do the same pids_uncharge(old_pids) + pids_charge(pids) sequence twice. Change copy_process() to call threadgroup_change_begin/threadgroup_change_end unconditionally. percpu_down_read() is cheap and this allows other cleanups, see the next changes. Also, this way we can unify cgroup_threadgroup_rwsem and dup_mmap_sem. Signed-off-by: Oleg Nesterov Acked-by: Zefan Li Signed-off-by: Tejun Heo diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index cdd8df4..15ef2e4 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -243,27 +243,10 @@ static void pids_cancel_fork(struct task_struct *task, void *priv) static void pids_fork(struct task_struct *task, void *priv) { - struct cgroup_subsys_state *css; - struct cgroup_subsys_state *old_css = priv; - struct pids_cgroup *pids; - struct pids_cgroup *old_pids = css_pids(old_css); - - css = task_get_css(task, pids_cgrp_id); - pids = css_pids(css); - - /* - * If the association has changed, we have to revert and reapply the - * charge/uncharge on the wrong hierarchy to the current one. Since - * the association can only change due to an organisation event, its - * okay for us to ignore the limit in this case. - */ - if (pids != old_pids) { - pids_uncharge(old_pids, 1); - pids_charge(pids, 1); - } + struct cgroup_subsys_state *css = priv; + WARN_ON(task_css_check(task, pids_cgrp_id, true) != css); css_put(css); - css_put(old_css); } static void pids_free(struct task_struct *task) diff --git a/kernel/fork.c b/kernel/fork.c index f97f2c4..fce002e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1368,8 +1368,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - if (clone_flags & CLONE_THREAD) - threadgroup_change_begin(current); + threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1610,8 +1609,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, proc_fork_connector(p); cgroup_post_fork(p, cgrp_ss_priv); - if (clone_flags & CLONE_THREAD) - threadgroup_change_end(current); + threadgroup_change_end(current); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -1652,8 +1650,7 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - if (clone_flags & CLONE_THREAD) - threadgroup_change_end(current); + threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); -- cgit v0.10.2 From afbcb364bee9e7cf46c94257a82cb9760b6d254f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 27 Nov 2015 19:57:22 +0100 Subject: cgroup: pids: kill pids_fork(), simplify pids_can_fork() and pids_cancel_fork() Now that we know that the forking task can't migrate amd the child is always moved to the same cgroup by cgroup_post_fork()->css_set_move_task() we can change pids_can_fork() and pids_cancel_fork() to just use task_css(current). And since we no longer need to pin this css, we can remove pid_fork(). Note: the patch uses task_css_check(true), perhaps it makes sense to add a helper or change task_css_set_check() to take cgroup_threadgroup_rwsem into account. Signed-off-by: Oleg Nesterov Acked-by: Zefan Li Signed-off-by: Tejun Heo diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 15ef2e4..de3359a 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -205,48 +205,28 @@ static void pids_cancel_attach(struct cgroup_subsys_state *css, } } +/* + * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies + * on threadgroup_change_begin() held by the copy_process(). + */ static int pids_can_fork(struct task_struct *task, void **priv_p) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; - int err; - /* - * Use the "current" task_css for the pids subsystem as the tentative - * css. It is possible we will charge the wrong hierarchy, in which - * case we will forcefully revert/reapply the charge on the right - * hierarchy after it is committed to the task proper. - */ - css = task_get_css(current, pids_cgrp_id); + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - - err = pids_try_charge(pids, 1); - if (err) - goto err_css_put; - - *priv_p = css; - return 0; - -err_css_put: - css_put(css); - return err; + return pids_try_charge(pids, 1); } static void pids_cancel_fork(struct task_struct *task, void *priv) { - struct cgroup_subsys_state *css = priv; - struct pids_cgroup *pids = css_pids(css); + struct cgroup_subsys_state *css; + struct pids_cgroup *pids; + css = task_css_check(current, pids_cgrp_id, true); + pids = css_pids(css); pids_uncharge(pids, 1); - css_put(css); -} - -static void pids_fork(struct task_struct *task, void *priv) -{ - struct cgroup_subsys_state *css = priv; - - WARN_ON(task_css_check(task, pids_cgrp_id, true) != css); - css_put(css); } static void pids_free(struct task_struct *task) @@ -329,7 +309,6 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .fork = pids_fork, .free = pids_free, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, -- cgit v0.10.2 From 023113d24ef9e1d2b44cb2446872b17e2b01d8b1 Mon Sep 17 00:00:00 2001 From: Xiangliang Yu Date: Thu, 26 Nov 2015 20:27:02 +0800 Subject: AHCI: Fix softreset failed issue of Port Multiplier Current code doesn't update port value of Port Multiplier(PM) when sending FIS of softreset to device, command will fail if FBS is enabled. There are two ways to fix the issue: the first is to disable FBS before sending softreset command to PM device and the second is to update port value of PM when sending command. For the first way, i can't find any related rule in AHCI Spec. The second way can avoid disabling FBS and has better performance. Signed-off-by: Xiangliang Yu Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index 096064c..4665512 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -1273,6 +1273,15 @@ static int ahci_exec_polled_cmd(struct ata_port *ap, int pmp, ata_tf_to_fis(tf, pmp, is_cmd, fis); ahci_fill_cmd_slot(pp, 0, cmd_fis_len | flags | (pmp << 12)); + /* set port value for softreset of Port Multiplier */ + if (pp->fbs_enabled && pp->fbs_last_dev != pmp) { + tmp = readl(port_mmio + PORT_FBS); + tmp &= ~(PORT_FBS_DEV_MASK | PORT_FBS_DEC); + tmp |= pmp << PORT_FBS_DEV_OFFSET; + writel(tmp, port_mmio + PORT_FBS); + pp->fbs_last_dev = pmp; + } + /* issue & wait */ writel(1, port_mmio + PORT_CMD_ISSUE); -- cgit v0.10.2 From d98f1cd0a3b70ea91f1dfda3ac36c3b2e1a4d5e2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 26 Nov 2015 12:00:59 -0500 Subject: sata_sil: disable trim When I connect an Intel SSD to SATA SIL controller (PCI ID 1095:3114), any TRIM command results in I/O errors being reported in the log. There is other similar error reported with TRIM and the SIL controller: https://bugs.centos.org/view.php?id=5880 Apparently the controller doesn't support TRIM commands. This patch disables TRIM support on the SATA SIL controller. ata7.00: exception Emask 0x0 SAct 0x0 SErr 0x0 action 0x0 ata7.00: BMDMA2 stat 0x50001 ata7.00: failed command: DATA SET MANAGEMENT ata7.00: cmd 06/01:01:00:00:00/00:00:00:00:00/a0 tag 0 dma 512 out res 51/04:01:00:00:00/00:00:00:00:00/a0 Emask 0x1 (device error) ata7.00: status: { DRDY ERR } ata7.00: error: { ABRT } ata7.00: device reported invalid CHS sector 0 sd 8:0:0:0: [sdb] tag#0 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE sd 8:0:0:0: [sdb] tag#0 Sense Key : Illegal Request [current] [descriptor] sd 8:0:0:0: [sdb] tag#0 Add. Sense: Unaligned write command sd 8:0:0:0: [sdb] tag#0 CDB: Write same(16) 93 08 00 00 00 00 00 21 95 88 00 20 00 00 00 00 blk_update_request: I/O error, dev sdb, sector 2200968 Signed-off-by: Mikulas Patocka Cc: stable@kernel.org Signed-off-by: Tejun Heo diff --git a/drivers/ata/sata_sil.c b/drivers/ata/sata_sil.c index dea6edcb..29bcff0 100644 --- a/drivers/ata/sata_sil.c +++ b/drivers/ata/sata_sil.c @@ -630,6 +630,9 @@ static void sil_dev_config(struct ata_device *dev) unsigned int n, quirks = 0; unsigned char model_num[ATA_ID_PROD_LEN + 1]; + /* This controller doesn't support trim */ + dev->horkage |= ATA_HORKAGE_NOTRIM; + ata_id_c_string(dev->id, model_num, ATA_ID_PROD, sizeof(model_num)); for (n = 0; sil_blacklist[n].product; n++) -- cgit v0.10.2 From d3e41a9fc091bf5f5d4504aead899d73bd67d71e Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Thu, 12 Nov 2015 15:07:30 +0100 Subject: ARM: dts: add dm816x missing #mbox-cells Add missing #mbox-cells for dm816x mbox DT node. Cc: Brian Hutchinson Signed-off-by: Neil Armstrong Signed-off-by: Tony Lindgren diff --git a/arch/arm/boot/dts/dm816x.dtsi b/arch/arm/boot/dts/dm816x.dtsi index 3c99cfa..a7a34e4 100644 --- a/arch/arm/boot/dts/dm816x.dtsi +++ b/arch/arm/boot/dts/dm816x.dtsi @@ -218,6 +218,7 @@ reg = <0x480c8000 0x2000>; interrupts = <77>; ti,hwmods = "mailbox"; + #mbox-cells = <1>; ti,mbox-num-users = <4>; ti,mbox-num-fifos = <12>; mbox_dsp: mbox_dsp { -- cgit v0.10.2 From f1b2e7c29a52309724d13a1eb4de457406697118 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Thu, 12 Nov 2015 15:07:35 +0100 Subject: ARM: dts: add dm816x missing spi DT dma handles Add the missing SPI controller DMA handler in the dm816x DT node, only properties for the two channels on four were present. Cc: Brian Hutchinson Signed-off-by: Neil Armstrong Signed-off-by: Tony Lindgren diff --git a/arch/arm/boot/dts/dm816x.dtsi b/arch/arm/boot/dts/dm816x.dtsi index a7a34e4..eee636d 100644 --- a/arch/arm/boot/dts/dm816x.dtsi +++ b/arch/arm/boot/dts/dm816x.dtsi @@ -280,8 +280,11 @@ ti,spi-num-cs = <4>; ti,hwmods = "mcspi1"; dmas = <&edma 16 &edma 17 - &edma 18 &edma 19>; - dma-names = "tx0", "rx0", "tx1", "rx1"; + &edma 18 &edma 19 + &edma 20 &edma 21 + &edma 22 &edma 23>; + dma-names = "tx0", "rx0", "tx1", "rx1", + "tx2", "rx2", "tx3", "rx3"; }; mmc1: mmc@48060000 { -- cgit v0.10.2 From 741e96e8790cbd389d27e29bbf66de2c691fd775 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 19:46:23 -0500 Subject: imx/clk-pllv1: fix wrong do_div() usage do_div() is meant to be used with an unsigned dividend. Signed-off-by: Nicolas Pitre Acked-by: Shawn Guo Signed-off-by: Stephen Boyd diff --git a/drivers/clk/imx/clk-pllv1.c b/drivers/clk/imx/clk-pllv1.c index 8564e43..82fe366 100644 --- a/drivers/clk/imx/clk-pllv1.c +++ b/drivers/clk/imx/clk-pllv1.c @@ -52,7 +52,7 @@ static unsigned long clk_pllv1_recalc_rate(struct clk_hw *hw, unsigned long parent_rate) { struct clk_pllv1 *pll = to_clk_pllv1(hw); - long long ll; + unsigned long long ull; int mfn_abs; unsigned int mfi, mfn, mfd, pd; u32 reg; @@ -94,16 +94,16 @@ static unsigned long clk_pllv1_recalc_rate(struct clk_hw *hw, rate = parent_rate * 2; rate /= pd + 1; - ll = (unsigned long long)rate * mfn_abs; + ull = (unsigned long long)rate * mfn_abs; - do_div(ll, mfd + 1); + do_div(ull, mfd + 1); if (mfn_is_negative(pll, mfn)) - ll = -ll; + ull = (rate * mfi) - ull; + else + ull = (rate * mfi) + ull; - ll = (rate * mfi) + ll; - - return ll; + return ull; } static struct clk_ops clk_pllv1_ops = { -- cgit v0.10.2 From 0d2681e1f19162e5975b70edd0c2d1d256f38023 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 20:01:40 -0500 Subject: imx/clk-pllv2: fix wrong do_div() usage do_div() is meant to be used with an unsigned dividend. Signed-off-by: Nicolas Pitre Acked-by: Shawn Guo Signed-off-by: Stephen Boyd diff --git a/drivers/clk/imx/clk-pllv2.c b/drivers/clk/imx/clk-pllv2.c index b18f875..4aeda56 100644 --- a/drivers/clk/imx/clk-pllv2.c +++ b/drivers/clk/imx/clk-pllv2.c @@ -79,7 +79,7 @@ static unsigned long __clk_pllv2_recalc_rate(unsigned long parent_rate, { long mfi, mfn, mfd, pdf, ref_clk; unsigned long dbl; - s64 temp; + u64 temp; dbl = dp_ctl & MXC_PLL_DP_CTL_DPDCK0_2_EN; @@ -98,8 +98,9 @@ static unsigned long __clk_pllv2_recalc_rate(unsigned long parent_rate, temp = (u64) ref_clk * abs(mfn); do_div(temp, mfd + 1); if (mfn < 0) - temp = -temp; - temp = (ref_clk * mfi) + temp; + temp = (ref_clk * mfi) - temp; + else + temp = (ref_clk * mfi) + temp; return temp; } @@ -126,7 +127,7 @@ static int __clk_pllv2_set_rate(unsigned long rate, unsigned long parent_rate, { u32 reg; long mfi, pdf, mfn, mfd = 999999; - s64 temp64; + u64 temp64; unsigned long quad_parent_rate; quad_parent_rate = 4 * parent_rate; -- cgit v0.10.2 From 279104e3ade92b38198fdaead9e84bd80057693a Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Mon, 23 Nov 2015 15:36:50 +0530 Subject: clk: qoriq: fix memory leak If get_pll_div() fails we exited by returning NULL but we missed releasing hwc. Signed-off-by: Sudip Mukherjee Fixes: 0dfc86b3173f ("clk: qoriq: Move chip-specific knowledge into driver") Signed-off-by: Stephen Boyd diff --git a/drivers/clk/clk-qoriq.c b/drivers/clk/clk-qoriq.c index 1ab0fb8..7bc1c45 100644 --- a/drivers/clk/clk-qoriq.c +++ b/drivers/clk/clk-qoriq.c @@ -778,8 +778,10 @@ static struct clk * __init create_one_cmux(struct clockgen *cg, int idx) */ clksel = (cg_in(cg, hwc->reg) & CLKSEL_MASK) >> CLKSEL_SHIFT; div = get_pll_div(cg, hwc, clksel); - if (!div) + if (!div) { + kfree(hwc); return NULL; + } pct80_rate = clk_get_rate(div->clk); pct80_rate *= 8; -- cgit v0.10.2 From 62e345ae5b6ed977b02c24d1eaeaece062f9fb17 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Mon, 30 Nov 2015 15:24:29 -0600 Subject: usb: dwc3: gadget: don't prestart interrupt endpoints Because interrupt endpoints usually transmit such small amounts of data, it seems pointless to prestart transfers and try to get speed improvements. This patch also sorts out a problem with CDC ECM function where its notification endpoint gets stuck in busy state and we continuously issue Update Transfer commands. Fixes: 8a1a9c9e4503 ("usb: dwc3: gadget: start transfer on XFER_COMPLETE") Signed-off-by: Felipe Balbi diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index e24a01c..a58376f 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1078,6 +1078,7 @@ static int __dwc3_gadget_ep_queue(struct dwc3_ep *dep, struct dwc3_request *req) * little bit faster. */ if (!usb_endpoint_xfer_isoc(dep->endpoint.desc) && + !usb_endpoint_xfer_int(dep->endpoint.desc) && !(dep->flags & DWC3_EP_BUSY)) { ret = __dwc3_gadget_kick_transfer(dep, 0, true); goto out; -- cgit v0.10.2 From fc82792851d693e4f6da81b4b97e7af62d596af9 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 26 Nov 2015 17:22:23 +0200 Subject: ARM: OMAP2+: enable REGULATOR_FIXED_VOLTAGE Enable REGULATOR_FIXED_VOLTAGE for all OMAP2+ platforms otherwise system can't boot from SD-card when kernel is built for single SoC (for example, with CONFIG_SOC_DRA7XX=y only). It's also required for almost all TI SoC's platforms. Signed-off-by: Grygorii Strashko Signed-off-by: Tony Lindgren diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig index 5076d3f..4b4371d 100644 --- a/arch/arm/mach-omap2/Kconfig +++ b/arch/arm/mach-omap2/Kconfig @@ -121,6 +121,7 @@ config ARCH_OMAP2PLUS_TYPICAL select NEON if CPU_V7 select PM select REGULATOR + select REGULATOR_FIXED_VOLTAGE select TWL4030_CORE if ARCH_OMAP3 || ARCH_OMAP4 select TWL4030_POWER if ARCH_OMAP3 || ARCH_OMAP4 select VFP @@ -201,7 +202,6 @@ config MACH_OMAP3_PANDORA depends on ARCH_OMAP3 default y select OMAP_PACKAGE_CBB - select REGULATOR_FIXED_VOLTAGE if REGULATOR config MACH_NOKIA_N810 bool -- cgit v0.10.2 From 9eb8cd2b0780bca0719e754e7bdcf5f368f001bd Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 18 Nov 2015 21:46:38 +1100 Subject: of: Fix comparison of reserved memory regions In order to check for overlapping reserved memory regions, we first need to sort the array of memory regions. This is implemented using sort(), and a custom comparison function __rmem_cmp(). Unfortunatley __rmem_cmp() doesn't work in all cases. Because the two base values are phys_addr_t, they may be u64 on some platforms, in which case subtracting one from the other and then (implicitly) casting to int does not give us the -ve/0/+ve value we need. This leads to incorrect reports about overlaps, eg: ibm,slw-image@1ffe600000 (0x0000001ffe600000--0x0000001ffe700000) overlaps with ibm,firmware-allocs-memory@1000000000 (0x0000001000000000--0x0000001000dc0200) Fix it by just doing the standard double if and return 0 logic. Fixes: ae1add247bf8 ("of: Check for overlap in reserved memory regions") Signed-off-by: Michael Ellerman Signed-off-by: Rob Herring diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index be77e75..1a3556a 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -206,7 +206,13 @@ static int __init __rmem_cmp(const void *a, const void *b) { const struct reserved_mem *ra = a, *rb = b; - return ra->base - rb->base; + if (ra->base < rb->base) + return -1; + + if (ra->base > rb->base) + return 1; + + return 0; } static void __init __rmem_check_for_overlap(void) -- cgit v0.10.2 From 78bb2abe09d6bd15eeee7250f2fa0cb76432a8a2 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Krivenok" Date: Mon, 30 Nov 2015 23:45:48 +0300 Subject: of: do not use 0x in front of %pa Signed-off-by: Dmitry V. Krivenok Signed-off-by: Rob Herring diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index d243029..1bbe3a9 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1041,7 +1041,7 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool nomap) { - pr_err("Reserved memory not supported, ignoring range 0x%pa - 0x%pa%s\n", + pr_err("Reserved memory not supported, ignoring range %pa - %pa%s\n", &base, &size, nomap ? " (nomap)" : ""); return -ENOSYS; } -- cgit v0.10.2 From ba85edbe0d39710dc74a2a9c6f90929b5722a53b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 30 Nov 2015 15:22:37 +0900 Subject: of/address: fix typo in comment block of of_translate_one() Remove the "not" before "cannot". I am fixing the comment block style while I am here. Signed-off-by: Masahiro Yamada Signed-off-by: Rob Herring diff --git a/drivers/of/address.c b/drivers/of/address.c index cd53fe4..9582c57 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -485,9 +485,10 @@ static int of_translate_one(struct device_node *parent, struct of_bus *bus, int rone; u64 offset = OF_BAD_ADDR; - /* Normally, an absence of a "ranges" property means we are + /* + * Normally, an absence of a "ranges" property means we are * crossing a non-translatable boundary, and thus the addresses - * below the current not cannot be converted to CPU physical ones. + * below the current cannot be converted to CPU physical ones. * Unfortunately, while this is very clear in the spec, it's not * what Apple understood, and they do have things like /uni-n or * /ht nodes with no "ranges" property and a lot of perfectly -- cgit v0.10.2 From e80cf2e50bfabb14dd3667b2360a393dda3edc3f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 21 Oct 2015 22:41:40 +0200 Subject: clk: scpi: add missing of_node_put for_each_available_child_of_node performs an of_node_get on each iteration, so a break out of the loop requires an of_node_put. The semantic patch that fixes this problem is as follows (http://coccinelle.lip6.fr): // @@ expression root,e; local idexpression child; @@ for_each_available_child_of_node(root, child) { ... when != of_node_put(child) when != e = child ( return child; | + of_node_put(child); ? return ...; ) ... } // Signed-off-by: Julia Lawall Acked-by: Sudeep Holla Signed-off-by: Stephen Boyd diff --git a/drivers/clk/clk-scpi.c b/drivers/clk/clk-scpi.c index 0b501a9..cd0f272 100644 --- a/drivers/clk/clk-scpi.c +++ b/drivers/clk/clk-scpi.c @@ -292,6 +292,7 @@ static int scpi_clocks_probe(struct platform_device *pdev) ret = scpi_clk_add(dev, child, match); if (ret) { scpi_clocks_remove(pdev); + of_node_put(child); return ret; } } -- cgit v0.10.2 From df36c5bede207f734e4750beb2b14fb892050280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Verg=C3=A9?= Date: Tue, 24 Nov 2015 16:02:04 +0100 Subject: USB: quirks: Fix another ELAN touchscreen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like other buggy models that had their fixes [1], the touchscreen with id 04f3:21b8 from ELAN Microelectronics needs the device-qualifier quirk. Otherwise, it fails to respond, blocks the boot for a random amount of time and pollutes dmesg with: [ 2887.373196] usb 1-5: new full-speed USB device number 41 using xhci_hcd [ 2889.502000] usb 1-5: unable to read config index 0 descriptor/start: -71 [ 2889.502005] usb 1-5: can't read configurations, error -71 [ 2889.654571] usb 1-5: new full-speed USB device number 42 using xhci_hcd [ 2891.783438] usb 1-5: unable to read config index 0 descriptor/start: -71 [ 2891.783443] usb 1-5: can't read configurations, error -71 [1]: See commits c68929f, 876af5d, d749947, a32c99e and dc703ec. Tested-by: Adrien Vergé Signed-off-by: Adrien Vergé Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index f5a3819..fcd6ac0 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -125,6 +125,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x04f3, 0x016f), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, + { USB_DEVICE(0x04f3, 0x21b8), .driver_info = + USB_QUIRK_DEVICE_QUALIFIER }, + /* Roland SC-8820 */ { USB_DEVICE(0x0582, 0x0007), .driver_info = USB_QUIRK_RESET_RESUME }, -- cgit v0.10.2 From 9fa62b1a31c96715aef34f25000e882ed4ac4876 Mon Sep 17 00:00:00 2001 From: Dmitry Katsubo Date: Fri, 20 Nov 2015 01:30:44 +0100 Subject: usb-storage: Fix scsi-sd failure "Invalid field in cdb" for USB adapter JMicron The patch extends the family of SATA-to-USB JMicron adapters that need FUA to be disabled and applies the same policy for uas driver. See details in http://unix.stackexchange.com/questions/237204/ Signed-off-by: Dmitry Katsubo Tested-by: Dmitry Katsubo Acked-by: Alan Stern Cc: stable Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index e691516..5c66d3f 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -796,6 +796,10 @@ static int uas_slave_configure(struct scsi_device *sdev) if (devinfo->flags & US_FL_NO_REPORT_OPCODES) sdev->no_report_opcodes = 1; + /* A few buggy USB-ATA bridges don't understand FUA */ + if (devinfo->flags & US_FL_BROKEN_FUA) + sdev->broken_fua = 1; + scsi_change_queue_depth(sdev, devinfo->qdepth - 2); return 0; } diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 6b24791..7ffe420 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -1987,7 +1987,7 @@ UNUSUAL_DEV( 0x14cd, 0x6600, 0x0201, 0x0201, US_FL_IGNORE_RESIDUE ), /* Reported by Michael Büsch */ -UNUSUAL_DEV( 0x152d, 0x0567, 0x0114, 0x0114, +UNUSUAL_DEV( 0x152d, 0x0567, 0x0114, 0x0116, "JMicron", "USB to ATA/ATAPI Bridge", USB_SC_DEVICE, USB_PR_DEVICE, NULL, diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h index c85ea53..ccc113e 100644 --- a/drivers/usb/storage/unusual_uas.h +++ b/drivers/usb/storage/unusual_uas.h @@ -132,7 +132,7 @@ UNUSUAL_DEV(0x152d, 0x0567, 0x0000, 0x9999, "JMicron", "JMS567", USB_SC_DEVICE, USB_PR_DEVICE, NULL, - US_FL_NO_REPORT_OPCODES), + US_FL_BROKEN_FUA | US_FL_NO_REPORT_OPCODES), /* Reported-by: Hans de Goede */ UNUSUAL_DEV(0x2109, 0x0711, 0x0000, 0x9999, -- cgit v0.10.2 From 9225c0b7b976dd9ceac2b80727a60d8fcb906a62 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 1 Dec 2015 19:52:12 +0000 Subject: staging: lustre: echo_copy.._lsm() dereferences userland pointers directly missing get_user() Signed-off-by: Al Viro Cc: stable Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c index f61ef66..a4a9a76 100644 --- a/drivers/staging/lustre/lustre/obdecho/echo_client.c +++ b/drivers/staging/lustre/lustre/obdecho/echo_client.c @@ -1270,6 +1270,7 @@ static int echo_copyout_lsm(struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob) { struct lov_stripe_md *ulsm = _ulsm; + struct lov_oinfo **p; int nob, i; nob = offsetof(struct lov_stripe_md, lsm_oinfo[lsm->lsm_stripe_count]); @@ -1279,9 +1280,10 @@ echo_copyout_lsm(struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob) if (copy_to_user(ulsm, lsm, sizeof(*ulsm))) return -EFAULT; - for (i = 0; i < lsm->lsm_stripe_count; i++) { - if (copy_to_user(ulsm->lsm_oinfo[i], lsm->lsm_oinfo[i], - sizeof(lsm->lsm_oinfo[0]))) + for (i = 0, p = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++, p++) { + struct lov_oinfo __user *up; + if (get_user(up, ulsm->lsm_oinfo + i) || + copy_to_user(up, *p, sizeof(struct lov_oinfo))) return -EFAULT; } return 0; @@ -1289,9 +1291,10 @@ echo_copyout_lsm(struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob) static int echo_copyin_lsm(struct echo_device *ed, struct lov_stripe_md *lsm, - void *ulsm, int ulsm_nob) + struct lov_stripe_md __user *ulsm, int ulsm_nob) { struct echo_client_obd *ec = ed->ed_ec; + struct lov_oinfo **p; int i; if (ulsm_nob < sizeof(*lsm)) @@ -1306,11 +1309,10 @@ echo_copyin_lsm(struct echo_device *ed, struct lov_stripe_md *lsm, ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL)) return -EINVAL; - for (i = 0; i < lsm->lsm_stripe_count; i++) { - if (copy_from_user(lsm->lsm_oinfo[i], - ((struct lov_stripe_md *)ulsm)-> \ - lsm_oinfo[i], - sizeof(lsm->lsm_oinfo[0]))) + for (i = 0, p = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++, p++) { + struct lov_oinfo __user *up; + if (get_user(up, ulsm->lsm_oinfo + i) || + copy_from_user(*p, up, sizeof(struct lov_oinfo))) return -EFAULT; } return 0; -- cgit v0.10.2 From d5d4fdd86f5759924fe54efa793e22eccf508db6 Mon Sep 17 00:00:00 2001 From: Guillaume Delbergue Date: Tue, 1 Dec 2015 18:55:51 +0100 Subject: irqchip/versatile-fpga: Fix PCI IRQ mapping on Versatile PB This patch is specifically for PCI support on the Versatile PB board using a DT. Currently, the dynamic IRQ mapping is broken when using DTs. For example, on QEMU, the SCSI driver is unable to request the IRQ. To fix this issue, this patch replaces the current dynamic mechanism with a static value as is done in the non-DT case. Signed-off-by: Guillaume Delbergue Signed-off-by: Arnd Bergmann Cc: stable@vger.kernel.org diff --git a/drivers/irqchip/irq-versatile-fpga.c b/drivers/irqchip/irq-versatile-fpga.c index 598ab3f..cadf104 100644 --- a/drivers/irqchip/irq-versatile-fpga.c +++ b/drivers/irqchip/irq-versatile-fpga.c @@ -210,7 +210,12 @@ int __init fpga_irq_of_init(struct device_node *node, parent_irq = -1; } +#ifdef CONFIG_ARCH_VERSATILE + fpga_irq_init(base, node->name, IRQ_SIC_START, parent_irq, valid_mask, + node); +#else fpga_irq_init(base, node->name, 0, parent_irq, valid_mask, node); +#endif writel(clear_mask, base + IRQ_ENABLE_CLEAR); writel(clear_mask, base + FIQ_ENABLE_CLEAR); -- cgit v0.10.2 From 33bd2dd03dd0bfa1130d11062a9e5f40d0cf1d3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Verg=C3=A9?= Date: Tue, 1 Dec 2015 19:56:48 +0100 Subject: USB: quirks: Apply ALWAYS_POLL to all ELAN devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All ELAN hid devices seem to require the ALWAYS_POLL quirk. Let's use this quirk for all devices from this vendor, rather than maintaining a list of all its known product IDs. Tested-by: Adrien Vergé Signed-off-by: Adrien Vergé Reviewed-by: Benjamin Tissoires Reviewed-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index ac1feea..3c7e0c3 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -316,11 +316,6 @@ #define USB_DEVICE_ID_DWAV_EGALAX_MULTITOUCH_A001 0xa001 #define USB_VENDOR_ID_ELAN 0x04f3 -#define USB_DEVICE_ID_ELAN_TOUCHSCREEN 0x0089 -#define USB_DEVICE_ID_ELAN_TOUCHSCREEN_009B 0x009b -#define USB_DEVICE_ID_ELAN_TOUCHSCREEN_0103 0x0103 -#define USB_DEVICE_ID_ELAN_TOUCHSCREEN_010c 0x010c -#define USB_DEVICE_ID_ELAN_TOUCHSCREEN_016F 0x016f #define USB_VENDOR_ID_ELECOM 0x056e #define USB_DEVICE_ID_ELECOM_BM084 0x0061 diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c index 94bb137..78fc94a 100644 --- a/drivers/hid/usbhid/hid-quirks.c +++ b/drivers/hid/usbhid/hid-quirks.c @@ -72,11 +72,7 @@ static const struct hid_blacklist { { USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_PIXART_USB_OPTICAL_MOUSE, HID_QUIRK_ALWAYS_POLL }, { USB_VENDOR_ID_DMI, USB_DEVICE_ID_DMI_ENC, HID_QUIRK_NOGET }, { USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_WIIU, HID_QUIRK_MULTI_INPUT }, - { USB_VENDOR_ID_ELAN, USB_DEVICE_ID_ELAN_TOUCHSCREEN, HID_QUIRK_ALWAYS_POLL }, - { USB_VENDOR_ID_ELAN, USB_DEVICE_ID_ELAN_TOUCHSCREEN_009B, HID_QUIRK_ALWAYS_POLL }, - { USB_VENDOR_ID_ELAN, USB_DEVICE_ID_ELAN_TOUCHSCREEN_0103, HID_QUIRK_ALWAYS_POLL }, - { USB_VENDOR_ID_ELAN, USB_DEVICE_ID_ELAN_TOUCHSCREEN_010c, HID_QUIRK_ALWAYS_POLL }, - { USB_VENDOR_ID_ELAN, USB_DEVICE_ID_ELAN_TOUCHSCREEN_016F, HID_QUIRK_ALWAYS_POLL }, + { USB_VENDOR_ID_ELAN, HID_ANY_ID, HID_QUIRK_ALWAYS_POLL }, { USB_VENDOR_ID_ELO, USB_DEVICE_ID_ELO_TS2700, HID_QUIRK_NOGET }, { USB_VENDOR_ID_FORMOSA, USB_DEVICE_ID_FORMOSA_IR_RECEIVER, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_FREESCALE, USB_DEVICE_ID_FREESCALE_MX28, HID_QUIRK_NOGET }, @@ -339,7 +335,8 @@ static const struct hid_blacklist *usbhid_exists_squirk(const u16 idVendor, for (; hid_blacklist[n].idVendor; n++) if (hid_blacklist[n].idVendor == idVendor && - hid_blacklist[n].idProduct == idProduct) + (hid_blacklist[n].idProduct == (__u16) HID_ANY_ID || + hid_blacklist[n].idProduct == idProduct)) bl_entry = &hid_blacklist[n]; if (bl_entry != NULL) -- cgit v0.10.2 From d66e5139aee7d8379fc135b00b9ac7c66f9d4c74 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Nov 2015 23:20:28 +0100 Subject: ARM: ixp4xx: fix read{b,w,l} return types On ixp4xx, the readl() function returns an 'unsigned long' output when indirect I/O is used. This is unlike any other platform, and it causes lots of harmless compiler warnings, such as: drivers/ata/libahci.c: In function 'ahci_show_host_version': drivers/ata/libahci.c:254:22: warning: format '%x' expects argument of type 'unsigned int', but argument 3 has type 'long unsigned int' [-Wformat=] drivers/block/mtip32xx/mtip32xx.c: In function 'mtip_hw_read_registers': drivers/block/mtip32xx/mtip32xx.c:2602:31: warning: format '%X' expects argument of type 'unsigned int', but argument 3 has type 'long unsigned int' [-Wformat=] drivers/block/cciss.c: In function 'print_cfg_table': drivers/block/cciss.c:3845:25: warning: format '%d' expects argument of type 'int', but argument 4 has type 'long unsigned int' [-Wformat=] This changes all six of the ixp4xx specific I/O read functions to return the same types that we have in the normal asm/io.h, to avoid the warnings. Signed-off-by: Arnd Bergmann Acked-by: Krzysztof Halasa diff --git a/arch/arm/mach-ixp4xx/include/mach/io.h b/arch/arm/mach-ixp4xx/include/mach/io.h index b024390..7a0c13b 100644 --- a/arch/arm/mach-ixp4xx/include/mach/io.h +++ b/arch/arm/mach-ixp4xx/include/mach/io.h @@ -143,7 +143,7 @@ static inline void __indirect_writesl(volatile void __iomem *bus_addr, writel(*vaddr++, bus_addr); } -static inline unsigned char __indirect_readb(const volatile void __iomem *p) +static inline u8 __indirect_readb(const volatile void __iomem *p) { u32 addr = (u32)p; u32 n, byte_enables, data; @@ -166,7 +166,7 @@ static inline void __indirect_readsb(const volatile void __iomem *bus_addr, *vaddr++ = readb(bus_addr); } -static inline unsigned short __indirect_readw(const volatile void __iomem *p) +static inline u16 __indirect_readw(const volatile void __iomem *p) { u32 addr = (u32)p; u32 n, byte_enables, data; @@ -189,7 +189,7 @@ static inline void __indirect_readsw(const volatile void __iomem *bus_addr, *vaddr++ = readw(bus_addr); } -static inline unsigned long __indirect_readl(const volatile void __iomem *p) +static inline u32 __indirect_readl(const volatile void __iomem *p) { u32 addr = (__force u32)p; u32 data; @@ -350,7 +350,7 @@ static inline void insl(u32 io_addr, void *p, u32 count) ((unsigned long)p <= (PIO_MASK + PIO_OFFSET))) #define ioread8(p) ioread8(p) -static inline unsigned int ioread8(const void __iomem *addr) +static inline u8 ioread8(const void __iomem *addr) { unsigned long port = (unsigned long __force)addr; if (__is_io_address(port)) @@ -378,7 +378,7 @@ static inline void ioread8_rep(const void __iomem *addr, void *vaddr, u32 count) } #define ioread16(p) ioread16(p) -static inline unsigned int ioread16(const void __iomem *addr) +static inline u16 ioread16(const void __iomem *addr) { unsigned long port = (unsigned long __force)addr; if (__is_io_address(port)) @@ -407,7 +407,7 @@ static inline void ioread16_rep(const void __iomem *addr, void *vaddr, } #define ioread32(p) ioread32(p) -static inline unsigned int ioread32(const void __iomem *addr) +static inline u32 ioread32(const void __iomem *addr) { unsigned long port = (unsigned long __force)addr; if (__is_io_address(port)) -- cgit v0.10.2 From 464ad8c43a9ead98c2b0eaed86bea727f2ad106e Mon Sep 17 00:00:00 2001 From: Hans Yang Date: Tue, 1 Dec 2015 16:54:59 +0800 Subject: usb: core : hub: Fix BOS 'NULL pointer' kernel panic When a USB 3.0 mass storage device is disconnected in transporting state, storage device driver may handle it as a transport error and reset the device by invoking usb_reset_and_verify_device() and following could happen: in usb_reset_and_verify_device(): udev->bos = NULL; For U1/U2 enabled devices, driver will disable LPM, and in some conditions: from usb_unlocked_disable_lpm() --> usb_disable_lpm() --> usb_enable_lpm() udev->bos->ss_cap->bU1devExitLat; And it causes 'NULL pointer' and 'kernel panic': [ 157.976257] Unable to handle kernel NULL pointer dereference at virtual address 00000010 ... [ 158.026400] PC is at usb_enable_link_state+0x34/0x2e0 [ 158.031442] LR is at usb_enable_lpm+0x98/0xac ... [ 158.137368] [] usb_enable_link_state+0x34/0x2e0 [ 158.143451] [] usb_enable_lpm+0x94/0xac [ 158.148840] [] usb_disable_lpm+0xa8/0xb4 ... [ 158.214954] Kernel panic - not syncing: Fatal exception This commit moves 'udev->bos = NULL' behind usb_unlocked_disable_lpm() to prevent from NULL pointer access. Issue can be reproduced by following setup: 1) A SS pen drive behind a SS hub connected to the host. 2) Transporting data between the pen drive and the host. 3) Abruptly disconnect hub and pen drive from host. 4) With a chance it crashes. Signed-off-by: Hans Yang Acked-by: Alan Stern Cc: stable Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index bdeadc1..585c3cb 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -5326,9 +5326,6 @@ static int usb_reset_and_verify_device(struct usb_device *udev) if (udev->usb2_hw_lpm_enabled == 1) usb_set_usb2_hardware_lpm(udev, 0); - bos = udev->bos; - udev->bos = NULL; - /* Disable LPM and LTM while we reset the device and reinstall the alt * settings. Device-initiated LPM settings, and system exit latency * settings are cleared when the device is reset, so we have to set @@ -5337,15 +5334,18 @@ static int usb_reset_and_verify_device(struct usb_device *udev) ret = usb_unlocked_disable_lpm(udev); if (ret) { dev_err(&udev->dev, "%s Failed to disable LPM\n.", __func__); - goto re_enumerate; + goto re_enumerate_no_bos; } ret = usb_disable_ltm(udev); if (ret) { dev_err(&udev->dev, "%s Failed to disable LTM\n.", __func__); - goto re_enumerate; + goto re_enumerate_no_bos; } + bos = udev->bos; + udev->bos = NULL; + for (i = 0; i < SET_CONFIG_TRIES; ++i) { /* ep0 maxpacket size may change; let the HCD know about it. @@ -5442,10 +5442,11 @@ done: return 0; re_enumerate: - /* LPM state doesn't matter when we're about to destroy the device. */ - hub_port_logical_disconnect(parent_hub, port1); usb_release_bos_descriptor(udev); udev->bos = bos; +re_enumerate_no_bos: + /* LPM state doesn't matter when we're about to destroy the device. */ + hub_port_logical_disconnect(parent_hub, port1); return -ENODEV; } -- cgit v0.10.2 From f9fa1887dcf26bd346665a6ae3d3f53dec54cba1 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 21 Nov 2015 00:36:44 +0300 Subject: USB: whci-hcd: add check for dma mapping error qset_fill_page_list() do not check for dma mapping errors. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Cc: stable Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/usb/host/whci/qset.c b/drivers/usb/host/whci/qset.c index dc31c42..9f1c053 100644 --- a/drivers/usb/host/whci/qset.c +++ b/drivers/usb/host/whci/qset.c @@ -377,6 +377,10 @@ static int qset_fill_page_list(struct whc *whc, struct whc_std *std, gfp_t mem_f if (std->pl_virt == NULL) return -ENOMEM; std->dma_addr = dma_map_single(whc->wusbhc.dev, std->pl_virt, pl_len, DMA_TO_DEVICE); + if (dma_mapping_error(whc->wusbhc.dev, std->dma_addr)) { + kfree(std->pl_virt); + return -EFAULT; + } for (p = 0; p < std->num_pointers; p++) { std->pl_virt[p].buf_ptr = cpu_to_le64(dma_addr); -- cgit v0.10.2 From 5377adb092664d336ac212499961cac5e8728794 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 18 Nov 2015 02:01:21 +0000 Subject: usb: Use the USB_SS_MULT() macro to decode burst multiplier for log message usb_parse_ss_endpoint_companion() now decodes the burst multiplier correctly in order to check that it's <= 3, but still uses the wrong expression if warning that it's > 3. Fixes: ff30cbc8da42 ("usb: Use the USB_SS_MULT() macro to get the ...") Signed-off-by: Ben Hutchings Cc: stable Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 7caff02..5050760 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -115,7 +115,8 @@ static void usb_parse_ss_endpoint_companion(struct device *ddev, int cfgno, USB_SS_MULT(desc->bmAttributes) > 3) { dev_warn(ddev, "Isoc endpoint has Mult of %d in " "config %d interface %d altsetting %d ep %d: " - "setting to 3\n", desc->bmAttributes + 1, + "setting to 3\n", + USB_SS_MULT(desc->bmAttributes), cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bmAttributes = 2; } -- cgit v0.10.2 From 3b60a26fdc3b32c00d750458df33d414e8f924ce Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Sat, 17 Oct 2015 21:05:20 -0700 Subject: ARM: imx: clk-vf610: fix SAI clock tree The Synchronous Audio Interface (SAI) instances are clocked by independent clocks: The bus clock and the audio clock (as shown in Figure 51-1 in the Vybrid Reference Manual). The clock gates in CCGR0/CCGR1 for SAI0 through SAI3 are bus clock gates, as access tests to the registers with/without gating those clocks have shown. The audio clock is gated by the SAIx_EN gates in CCM_CSCDR1, followed by a clock divider (SAIx_DIV). Currently, the parent of the bus clock gates has been assigned to SAIx_DIV, which is not involved in the bus clock path for the SAI instances (see chapter 9.10.12, SAI clocking in the Vybrid Reference Manual). Fix this by define the parent clock of VF610_CLK_SAIx to be the bus clock. If the driver needs the audio clock (when used in master mode), a fixed device tree is required which assign the audio clock properly to VF610_CLK_SAIx_DIV. Signed-off-by: Stefan Agner Acked-by: Stephen Boyd Signed-off-by: Shawn Guo diff --git a/drivers/clk/imx/clk-vf610.c b/drivers/clk/imx/clk-vf610.c index d1b1c95..0a94d96 100644 --- a/drivers/clk/imx/clk-vf610.c +++ b/drivers/clk/imx/clk-vf610.c @@ -335,22 +335,22 @@ static void __init vf610_clocks_init(struct device_node *ccm_node) clk[VF610_CLK_SAI0_SEL] = imx_clk_mux("sai0_sel", CCM_CSCMR1, 0, 2, sai_sels, 4); clk[VF610_CLK_SAI0_EN] = imx_clk_gate("sai0_en", "sai0_sel", CCM_CSCDR1, 16); clk[VF610_CLK_SAI0_DIV] = imx_clk_divider("sai0_div", "sai0_en", CCM_CSCDR1, 0, 4); - clk[VF610_CLK_SAI0] = imx_clk_gate2("sai0", "sai0_div", CCM_CCGR0, CCM_CCGRx_CGn(15)); + clk[VF610_CLK_SAI0] = imx_clk_gate2("sai0", "ipg_bus", CCM_CCGR0, CCM_CCGRx_CGn(15)); clk[VF610_CLK_SAI1_SEL] = imx_clk_mux("sai1_sel", CCM_CSCMR1, 2, 2, sai_sels, 4); clk[VF610_CLK_SAI1_EN] = imx_clk_gate("sai1_en", "sai1_sel", CCM_CSCDR1, 17); clk[VF610_CLK_SAI1_DIV] = imx_clk_divider("sai1_div", "sai1_en", CCM_CSCDR1, 4, 4); - clk[VF610_CLK_SAI1] = imx_clk_gate2("sai1", "sai1_div", CCM_CCGR1, CCM_CCGRx_CGn(0)); + clk[VF610_CLK_SAI1] = imx_clk_gate2("sai1", "ipg_bus", CCM_CCGR1, CCM_CCGRx_CGn(0)); clk[VF610_CLK_SAI2_SEL] = imx_clk_mux("sai2_sel", CCM_CSCMR1, 4, 2, sai_sels, 4); clk[VF610_CLK_SAI2_EN] = imx_clk_gate("sai2_en", "sai2_sel", CCM_CSCDR1, 18); clk[VF610_CLK_SAI2_DIV] = imx_clk_divider("sai2_div", "sai2_en", CCM_CSCDR1, 8, 4); - clk[VF610_CLK_SAI2] = imx_clk_gate2("sai2", "sai2_div", CCM_CCGR1, CCM_CCGRx_CGn(1)); + clk[VF610_CLK_SAI2] = imx_clk_gate2("sai2", "ipg_bus", CCM_CCGR1, CCM_CCGRx_CGn(1)); clk[VF610_CLK_SAI3_SEL] = imx_clk_mux("sai3_sel", CCM_CSCMR1, 6, 2, sai_sels, 4); clk[VF610_CLK_SAI3_EN] = imx_clk_gate("sai3_en", "sai3_sel", CCM_CSCDR1, 19); clk[VF610_CLK_SAI3_DIV] = imx_clk_divider("sai3_div", "sai3_en", CCM_CSCDR1, 12, 4); - clk[VF610_CLK_SAI3] = imx_clk_gate2("sai3", "sai3_div", CCM_CCGR1, CCM_CCGRx_CGn(2)); + clk[VF610_CLK_SAI3] = imx_clk_gate2("sai3", "ipg_bus", CCM_CCGR1, CCM_CCGRx_CGn(2)); clk[VF610_CLK_NFC_SEL] = imx_clk_mux("nfc_sel", CCM_CSCMR1, 12, 2, nfc_sels, 4); clk[VF610_CLK_NFC_EN] = imx_clk_gate("nfc_en", "nfc_sel", CCM_CSCDR2, 9); -- cgit v0.10.2 From 531ee1f4ea7fb5634cb79824e6df6be83596a054 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Sat, 17 Oct 2015 21:05:21 -0700 Subject: ARM: dts: vf610: fix clock definition for SAI2 So far, only the bus clock has been assigned, but in reality the SAI IP has for clock inputs. The driver has been updated to make use of the additional clock inputs by c3ecef21c3f2 ("ASoC: fsl_sai: add sai master mode support"). Due to a bug in the clock tree, the audio clock has been enabled none the less by the specified bus clock (see "ARM: imx: clk-vf610: fix SAI clock tree"), which made master mode even without the proper clock assigned working. This patch completes the clock definition for SAI2. On Vybrid, only two MCLK out of the four options are available (the first being the bus clock itself). See chapter 8.10.1.2.3 of the Vybrid Reference manual ("SAI transmitter and receiver options for MCLK selection"). Note: The audio clocks are only required in master mode. Signed-off-by: Stefan Agner Signed-off-by: Shawn Guo diff --git a/arch/arm/boot/dts/vfxxx.dtsi b/arch/arm/boot/dts/vfxxx.dtsi index 0d5acc2..3cd1b27 100644 --- a/arch/arm/boot/dts/vfxxx.dtsi +++ b/arch/arm/boot/dts/vfxxx.dtsi @@ -178,8 +178,10 @@ compatible = "fsl,vf610-sai"; reg = <0x40031000 0x1000>; interrupts = <86 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&clks VF610_CLK_SAI2>; - clock-names = "sai"; + clocks = <&clks VF610_CLK_SAI2>, + <&clks VF610_CLK_SAI2_DIV>, + <&clks 0>, <&clks 0>; + clock-names = "bus", "mclk1", "mclk2", "mclk3"; dma-names = "tx", "rx"; dmas = <&edma0 0 21>, <&edma0 0 20>; -- cgit v0.10.2 From 30ce6e1cc5a0f781d60227e9096c86e188d2c2bd Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 23 Nov 2015 16:24:45 -0500 Subject: dm btree: fix leak of bufio-backed block in btree_split_sibling error path The block allocated at the start of btree_split_sibling() is never released if later insert_at() fails. Fix this by releasing the previously allocated bufio block using unlock_block(). Reported-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index c573402..0918a7c 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -473,8 +473,10 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, r = insert_at(sizeof(__le64), pn, parent_index + 1, le64_to_cpu(rn->keys[0]), &location); - if (r) + if (r) { + unlock_block(s->info, right); return r; + } if (key < le64_to_cpu(rn->keys[0])) { unlock_block(s->info, right); -- cgit v0.10.2 From 993ceab91986e2e737ce9a3e23bebc8cce649240 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 2 Dec 2015 12:24:39 +0000 Subject: dm thin metadata: fix bug in dm_thin_remove_range() dm_btree_remove_leaves() only unmaps a contiguous region so we need a loop, in __remove_range(), to handle ranges that contain multiple regions. A new btree function, dm_btree_lookup_next(), is introduced which is more efficiently able to skip over regions of the thin device which aren't mapped. __remove_range() uses dm_btree_lookup_next() for each iteration of __remove_range()'s loop. Also, improve description of dm_btree_remove_leaves(). Fixes: 6550f075 ("dm thin metadata: add dm_thin_remove_range()") Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 4.1+ diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 1fa4569..67871e7 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1538,7 +1538,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block) static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end) { int r; - unsigned count; + unsigned count, total_count = 0; struct dm_pool_metadata *pmd = td->pmd; dm_block_t keys[1] = { td->id }; __le64 value; @@ -1561,11 +1561,29 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_ if (r) return r; - r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count); - if (r) - return r; + /* + * Remove leaves stops at the first unmapped entry, so we have to + * loop round finding mapped ranges. + */ + while (begin < end) { + r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value); + if (r == -ENODATA) + break; + + if (r) + return r; + + if (begin >= end) + break; + + r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count); + if (r) + return r; + + total_count += count; + } - td->mapped_blocks -= count; + td->mapped_blocks -= total_count; td->changed = 1; /* diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 0918a7c..7e5b7f1 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -63,6 +63,11 @@ int lower_bound(struct btree_node *n, uint64_t key) return bsearch(n, key, 0); } +static int upper_bound(struct btree_node *n, uint64_t key) +{ + return bsearch(n, key, 1); +} + void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt) { @@ -392,6 +397,82 @@ int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, } EXPORT_SYMBOL_GPL(dm_btree_lookup); +static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root, + uint64_t key, uint64_t *rkey, void *value_le) +{ + int r, i; + uint32_t flags, nr_entries; + struct dm_block *node; + struct btree_node *n; + + r = bn_read_lock(info, root, &node); + if (r) + return r; + + n = dm_block_data(node); + flags = le32_to_cpu(n->header.flags); + nr_entries = le32_to_cpu(n->header.nr_entries); + + if (flags & INTERNAL_NODE) { + i = lower_bound(n, key); + if (i < 0 || i >= nr_entries) { + r = -ENODATA; + goto out; + } + + r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le); + if (r == -ENODATA && i < (nr_entries - 1)) { + i++; + r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le); + } + + } else { + i = upper_bound(n, key); + if (i < 0 || i >= nr_entries) { + r = -ENODATA; + goto out; + } + + *rkey = le64_to_cpu(n->keys[i]); + memcpy(value_le, value_ptr(n, i), info->value_type.size); + } +out: + dm_tm_unlock(info->tm, node); + return r; +} + +int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, uint64_t *rkey, void *value_le) +{ + unsigned level; + int r = -ENODATA; + __le64 internal_value_le; + struct ro_spine spine; + + init_ro_spine(&spine, info); + for (level = 0; level < info->levels - 1u; level++) { + r = btree_lookup_raw(&spine, root, keys[level], + lower_bound, rkey, + &internal_value_le, sizeof(uint64_t)); + if (r) + goto out; + + if (*rkey != keys[level]) { + r = -ENODATA; + goto out; + } + + root = le64_to_cpu(internal_value_le); + } + + r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le); +out: + exit_ro_spine(&spine); + return r; +} + +EXPORT_SYMBOL_GPL(dm_btree_lookup_next); + /* * Splits a node by creating a sibling node and shifting half the nodes * contents across. Assumes there is a parent node, and it has room for diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index 11d8cf7..c74301f 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -110,6 +110,13 @@ int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, void *value_le); /* + * Tries to find the first key where the bottom level key is >= to that + * given. Useful for skipping empty sections of the btree. + */ +int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, uint64_t *rkey, void *value_le); + +/* * Insertion (or overwrite an existing value). O(ln(n)) */ int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, @@ -135,9 +142,10 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, dm_block_t *new_root); /* - * Removes values between 'keys' and keys2, where keys2 is keys with the - * final key replaced with 'end_key'. 'end_key' is the one-past-the-end - * value. 'keys' may be altered. + * Removes a _contiguous_ run of values starting from 'keys' and not + * reaching keys2 (where keys2 is keys with the final key replaced with + * 'end_key'). 'end_key' is the one-past-the-end value. 'keys' may be + * altered. */ int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, uint64_t end_key, -- cgit v0.10.2 From 59f0ec231f397001801264063db3b6dcc3eef590 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 1 Dec 2015 12:14:52 +0100 Subject: clk: sunxi: pll2: Fix clock running too fast Contrary to what the datasheet says, the pre divider doesn't seem to be incremented by one in the PLL2, but just uses the value from the register, with 0 being a bypass. This fixes the audio playing too fast. Since we now have the same pre-divider flags, and the only difference with the A10 is the post-divider offset, also remove the structure to just pass the offset as an argument. Signed-off-by: Maxime Ripard Fixes: eb662f854710 ("clk: sunxi: pll2: Add A13 support") Signed-off-by: Stephen Boyd diff --git a/drivers/clk/sunxi/clk-a10-pll2.c b/drivers/clk/sunxi/clk-a10-pll2.c index 5484c31e..0ee1f36 100644 --- a/drivers/clk/sunxi/clk-a10-pll2.c +++ b/drivers/clk/sunxi/clk-a10-pll2.c @@ -41,15 +41,10 @@ #define SUN4I_PLL2_OUTPUTS 4 -struct sun4i_pll2_data { - u32 post_div_offset; - u32 pre_div_flags; -}; - static DEFINE_SPINLOCK(sun4i_a10_pll2_lock); static void __init sun4i_pll2_setup(struct device_node *node, - struct sun4i_pll2_data *data) + int post_div_offset) { const char *clk_name = node->name, *parent; struct clk **clks, *base_clk, *prediv_clk; @@ -76,7 +71,7 @@ static void __init sun4i_pll2_setup(struct device_node *node, parent, 0, reg, SUN4I_PLL2_PRE_DIV_SHIFT, SUN4I_PLL2_PRE_DIV_WIDTH, - data->pre_div_flags, + CLK_DIVIDER_ONE_BASED | CLK_DIVIDER_ALLOW_ZERO, &sun4i_a10_pll2_lock); if (!prediv_clk) { pr_err("Couldn't register the prediv clock\n"); @@ -127,7 +122,7 @@ static void __init sun4i_pll2_setup(struct device_node *node, */ val = readl(reg); val &= ~(SUN4I_PLL2_POST_DIV_MASK << SUN4I_PLL2_POST_DIV_SHIFT); - val |= (SUN4I_PLL2_POST_DIV_VALUE - data->post_div_offset) << SUN4I_PLL2_POST_DIV_SHIFT; + val |= (SUN4I_PLL2_POST_DIV_VALUE - post_div_offset) << SUN4I_PLL2_POST_DIV_SHIFT; writel(val, reg); of_property_read_string_index(node, "clock-output-names", @@ -191,25 +186,17 @@ err_unmap: iounmap(reg); } -static struct sun4i_pll2_data sun4i_a10_pll2_data = { - .pre_div_flags = CLK_DIVIDER_ONE_BASED | CLK_DIVIDER_ALLOW_ZERO, -}; - static void __init sun4i_a10_pll2_setup(struct device_node *node) { - sun4i_pll2_setup(node, &sun4i_a10_pll2_data); + sun4i_pll2_setup(node, 0); } CLK_OF_DECLARE(sun4i_a10_pll2, "allwinner,sun4i-a10-pll2-clk", sun4i_a10_pll2_setup); -static struct sun4i_pll2_data sun5i_a13_pll2_data = { - .post_div_offset = 1, -}; - static void __init sun5i_a13_pll2_setup(struct device_node *node) { - sun4i_pll2_setup(node, &sun5i_a13_pll2_data); + sun4i_pll2_setup(node, 1); } CLK_OF_DECLARE(sun5i_a13_pll2, "allwinner,sun5i-a13-pll2-clk", -- cgit v0.10.2 From 599c963a0f19b14132065788322207eaa58bc7f8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Dec 2015 10:18:21 -0500 Subject: cgroup_freezer: simplify propagation of CGROUP_FROZEN clearing in freezer_attach() If one or more tasks get moved into a frozen css, the frozen state is cleared up from the destination css so that it can be reasserted once the migrated tasks are frozen. freezer_attach() implements this in two separate steps - clearing CGROUP_FROZEN on the target css while processing each task and propagating the clearing upwards after the task loop is done if necessary. This patch merges the two steps. Propagation now takes place inside the task loop. This simplifies the code and prepares it for the fix of multi-destination migration. Signed-off-by: Tejun Heo diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f1b30ad..ff02a8e 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -158,9 +158,7 @@ static void freezer_css_free(struct cgroup_subsys_state *css) static void freezer_attach(struct cgroup_subsys_state *new_css, struct cgroup_taskset *tset) { - struct freezer *freezer = css_freezer(new_css); struct task_struct *task; - bool clear_frozen = false; mutex_lock(&freezer_mutex); @@ -175,21 +173,20 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ cgroup_taskset_for_each(task, tset) { + struct freezer *freezer = css_freezer(new_css); + if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { freeze_task(task); - freezer->state &= ~CGROUP_FROZEN; - clear_frozen = true; + /* clear FROZEN and propagate upwards */ + while (freezer && (freezer->state & CGROUP_FROZEN)) { + freezer->state &= ~CGROUP_FROZEN; + freezer = parent_freezer(freezer); + } } } - /* propagate FROZEN clearing upwards */ - while (clear_frozen && (freezer = parent_freezer(freezer))) { - freezer->state &= ~CGROUP_FROZEN; - clear_frozen = freezer->state & CGROUP_FREEZING; - } - mutex_unlock(&freezer_mutex); } -- cgit v0.10.2 From 1f7dd3e5a6e4f093017fff12232572ee1aa4639b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Dec 2015 10:18:21 -0500 Subject: cgroup: fix handling of multi-destination migration from subtree_control enabling Consider the following v2 hierarchy. P0 (+memory) --- P1 (-memory) --- A \- B P0 has memory enabled in its subtree_control while P1 doesn't. If both A and B contain processes, they would belong to the memory css of P1. Now if memory is enabled on P1's subtree_control, memory csses should be created on both A and B and A's processes should be moved to the former and B's processes the latter. IOW, enabling controllers can cause atomic migrations into different csses. The core cgroup migration logic has been updated accordingly but the controller migration methods haven't and still assume that all tasks migrate to a single target css; furthermore, the methods were fed the css in which subtree_control was updated which is the parent of the target csses. pids controller depends on the migration methods to move charges and this made the controller attribute charges to the wrong csses often triggering the following warning by driving a counter negative. WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40() Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29 ... ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000 ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00 ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8 Call Trace: [] dump_stack+0x4e/0x82 [] warn_slowpath_common+0x82/0xc0 [] warn_slowpath_null+0x1a/0x20 [] pids_cancel.constprop.6+0x31/0x40 [] pids_can_attach+0x6d/0xf0 [] cgroup_taskset_migrate+0x6c/0x330 [] cgroup_migrate+0xf5/0x190 [] cgroup_attach_task+0x176/0x200 [] __cgroup_procs_write+0x2ad/0x460 [] cgroup_procs_write+0x14/0x20 [] cgroup_file_write+0x35/0x1c0 [] kernfs_fop_write+0x141/0x190 [] __vfs_write+0x28/0xe0 [] vfs_write+0xac/0x1a0 [] SyS_write+0x49/0xb0 [] entry_SYSCALL_64_fastpath+0x12/0x76 This patch fixes the bug by removing @css parameter from the three migration methods, ->can_attach, ->cancel_attach() and ->attach() and updating cgroup_taskset iteration helpers also return the destination css in addition to the task being migrated. All controllers are updated accordingly. * Controllers which don't care whether there are one or multiple target csses can be converted trivially. cpu, io, freezer, perf, netclassid and netprio fall in this category. * cpuset's current implementation assumes that there's single source and destination and thus doesn't support v2 hierarchy already. The only change made by this patchset is how that single destination css is obtained. * memory migration path already doesn't do anything on v2. How the single destination css is obtained is updated and the prep stage of mem_cgroup_can_attach() is reordered to accomodate the change. * pids is the only controller which was affected by this bug. It now correctly handles multi-destination migrations and no longer causes counter underflow from incorrect accounting. Signed-off-by: Tejun Heo Reported-and-tested-by: Daniel Wagner Cc: Aleksa Sarai diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5bcdfc1..5a37188 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1127,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *dst_css; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 869fd4a..06b77f9d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -422,12 +422,9 @@ struct cgroup_subsys { void (*css_reset)(struct cgroup_subsys_state *css); void (*css_e_css_changed)(struct cgroup_subsys_state *css); - int (*can_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); + int (*can_attach)(struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup_taskset *tset); + void (*attach)(struct cgroup_taskset *tset); int (*can_fork)(struct task_struct *task, void **priv_p); void (*cancel_fork)(struct task_struct *task, void *priv); void (*fork)(struct task_struct *task, void *priv); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f640830..cb91b44 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -120,8 +120,10 @@ struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp); +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it); @@ -236,30 +238,39 @@ void css_task_iter_end(struct css_task_iter *it); /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor + * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple - * processes. When there are multiple tasks in @tset, if a task of a - * process is in @tset, all tasks of the process are in @tset. Also, all - * are guaranteed to share the same source and destination csses. + * processes. + * + * On the v2 hierarchy, there may be tasks from multiple processes and they + * may not share the source or destination csses. + * + * On traditional hierarchies, when there are multiple tasks in @tset, if a + * task of a process is in @tset, all tasks of the process are in @tset. + * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ -#define cgroup_taskset_for_each(task, tset) \ - for ((task) = cgroup_taskset_first((tset)); (task); \ - (task) = cgroup_taskset_next((tset))) +#define cgroup_taskset_for_each(task, dst_css, tset) \ + for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ + (task); \ + (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor + * @dst_css: the destination css * @tset: takset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. */ -#define cgroup_taskset_for_each_leader(leader, tset) \ - for ((leader) = cgroup_taskset_first((tset)); (leader); \ - (leader) = cgroup_taskset_next((tset))) \ +#define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ + for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ + (leader); \ + (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5cea63f..470f653 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2237,6 +2237,9 @@ struct cgroup_taskset { struct list_head src_csets; struct list_head dst_csets; + /* the subsys currently being processed */ + int ssid; + /* * Fields for cgroup_taskset_*() iteration. * @@ -2299,25 +2302,29 @@ static void cgroup_taskset_add(struct task_struct *task, /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; - return cgroup_taskset_next(tset); + return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; @@ -2332,6 +2339,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; + + /* + * This function may be called both before and + * after cgroup_taskset_migrate(). The two cases + * can be distinguished by looking at whether @cset + * has its ->mg_dst_cset set. + */ + if (cset->mg_dst_cset) + *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; + else + *dst_cssp = cset->subsys[tset->ssid]; + return task; } @@ -2367,7 +2386,8 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, /* check that we can legitimately attach to the cgroup */ for_each_e_css(css, i, dst_cgrp) { if (css->ss->can_attach) { - ret = css->ss->can_attach(css, tset); + tset->ssid = i; + ret = css->ss->can_attach(tset); if (ret) { failed_css = css; goto out_cancel_attach; @@ -2400,9 +2420,12 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - for_each_e_css(css, i, dst_cgrp) - if (css->ss->attach) - css->ss->attach(css, tset); + for_each_e_css(css, i, dst_cgrp) { + if (css->ss->attach) { + tset->ssid = i; + css->ss->attach(tset); + } + } ret = 0; goto out_release_tset; @@ -2411,8 +2434,10 @@ out_cancel_attach: for_each_e_css(css, i, dst_cgrp) { if (css == failed_css) break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, tset); + if (css->ss->cancel_attach) { + tset->ssid = i; + css->ss->cancel_attach(tset); + } } out_release_tset: spin_lock_bh(&css_set_lock); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ff02a8e..2d3df82 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -155,10 +155,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup_subsys_state *new_css, - struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *new_css; mutex_lock(&freezer_mutex); @@ -172,7 +172,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, new_css, tset) { struct freezer *freezer = css_freezer(new_css); if (!(freezer->state & CGROUP_FREEZING)) { diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index de3359a..8e27fc5 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -162,13 +162,13 @@ revert: return -EAGAIN; } -static int pids_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int pids_can_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; @@ -187,13 +187,13 @@ static int pids_can_attach(struct cgroup_subsys_state *css, return 0; } -static void pids_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void pids_cancel_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10ae736..02a8ea5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1429,15 +1429,16 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_taskset *tset) { - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct task_struct *task; int ret; /* used later by cpuset_attach() */ - cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -1447,7 +1448,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task, cs->cpus_allowed); if (ret) goto out_unlock; @@ -1467,9 +1468,14 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_cancel_attach(struct cgroup_taskset *tset) { + struct cgroup_subsys_state *css; + struct cpuset *cs; + + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); @@ -1482,16 +1488,19 @@ static void cpuset_cancel_attach(struct cgroup_subsys_state *css, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_taskset *tset) { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); /* prepare for attach */ @@ -1502,7 +1511,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1518,7 +1527,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * sleep and should be moved outside migration path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); if (mm) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 36babfd..026305d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9456,12 +9456,12 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac..a9db4819 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8217,12 +8217,12 @@ static void cpu_cgroup_fork(struct task_struct *task, void *private) sched_move_task(task); } -static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -8235,12 +8235,12 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpu_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9acfb16..c92a65b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4779,23 +4779,18 @@ static void mem_cgroup_clear_mc(void) spin_unlock(&mc.lock); } -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; struct mem_cgroup *from; struct task_struct *leader, *p; struct mm_struct *mm; unsigned long move_flags; int ret = 0; - /* - * We are now commited to this value whatever it is. Changes in this - * tunable will only affect upcoming migrations, not the current one. - * So we need to save it, and keep it going. - */ - move_flags = READ_ONCE(memcg->move_charge_at_immigrate); - if (!move_flags) + /* charge immigration isn't supported on the default hierarchy */ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; /* @@ -4805,13 +4800,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, * multiple. */ p = NULL; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { WARN_ON_ONCE(p); p = leader; + memcg = mem_cgroup_from_css(css); } if (!p) return 0; + /* + * We are now commited to this value whatever it is. Changes in this + * tunable will only affect upcoming migrations, not the current one. + * So we need to save it, and keep it going. + */ + move_flags = READ_ONCE(memcg->move_charge_at_immigrate); + if (!move_flags) + return 0; + from = mem_cgroup_from_task(p); VM_BUG_ON(from == memcg); @@ -4842,8 +4847,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { if (mc.to) mem_cgroup_clear_mc(); @@ -4985,10 +4989,10 @@ retry: atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { - struct task_struct *p = cgroup_taskset_first(tset); + struct cgroup_subsys_state *css; + struct task_struct *p = cgroup_taskset_first(tset, &css); struct mm_struct *mm = get_task_mm(p); if (mm) { @@ -5000,17 +5004,14 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { } #endif diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 6441f47..81cb3c7 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -67,14 +67,15 @@ static int update_classid(const void *v, struct file *file, unsigned n) return 0; } -static void cgrp_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cgrp_attach(struct cgroup_taskset *tset) { - struct cgroup_cls_state *cs = css_cls_state(css); - void *v = (void *)(unsigned long)cs->classid; struct task_struct *p; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup_cls_state *cs = css_cls_state(css); + void *v = (void *)(unsigned long)cs->classid; - cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_classid, v); task_unlock(p); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index cbd0a19..40fd09f 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -218,13 +218,14 @@ static int update_netprio(const void *v, struct file *file, unsigned n) return 0; } -static void net_prio_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; - void *v = (void *)(unsigned long)css->cgroup->id; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + void *v = (void *)(unsigned long)css->cgroup->id; - cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); -- cgit v0.10.2 From 67cde9c4938945b9510730c64e68d2f1dd7bc0aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Dec 2015 10:18:21 -0500 Subject: cgroup_pids: don't account for the root cgroup Because accounting resources for the root cgroup sometimes incurs measureable overhead for workloads which don't care about cgroup and often ends up calculating a number which is available elsewhere in a slightly different form, cgroup is not in the business of providing system-wide statistics. The pids controller which was introduced recently was exposing "pids.current" at the root. This patch disable accounting for root cgroup and removes the file from the root directory. While this is a userland visible behavior change, pids has been available only in one version and was badly broken there, so I don't think this will be noticeable. If it turns out to be a problem, we can reinstate it for v1 hierarchies. Signed-off-by: Tejun Heo Cc: Aleksa Sarai diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 8e27fc5..b50d5a1 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -106,7 +106,7 @@ static void pids_uncharge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; - for (p = pids; p; p = parent_pids(p)) + for (p = pids; parent_pids(p); p = parent_pids(p)) pids_cancel(p, num); } @@ -123,7 +123,7 @@ static void pids_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; - for (p = pids; p; p = parent_pids(p)) + for (p = pids; parent_pids(p); p = parent_pids(p)) atomic64_add(num, &p->counter); } @@ -140,7 +140,7 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p, *q; - for (p = pids; p; p = parent_pids(p)) { + for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); /* @@ -298,6 +298,7 @@ static struct cftype pids_files[] = { { .name = "current", .read_s64 = pids_current_read, + .flags = CFTYPE_NOT_ON_ROOT, }, { } /* terminate */ }; -- cgit v0.10.2 From 4fd41a8552afc01054d9d9fc7f1a63c324867d27 Mon Sep 17 00:00:00 2001 From: Ken Xue Date: Tue, 1 Dec 2015 14:45:46 +0800 Subject: SCSI: Fix NULL pointer dereference in runtime PM The routines in scsi_pm.c assume that if a runtime-PM callback is invoked for a SCSI device, it can only mean that the device's driver has asked the block layer to handle the runtime power management (by calling blk_pm_runtime_init(), which among other things sets q->dev). However, this assumption turns out to be wrong for things like the ses driver. Normally ses devices are not allowed to do runtime PM, but userspace can override this setting. If this happens, the kernel gets a NULL pointer dereference when blk_post_runtime_resume() tries to use the uninitialized q->dev pointer. This patch fixes the problem by checking q->dev in block layer before handle runtime PM. Since ses doesn't define any PM callbacks and call blk_pm_runtime_init(), the crash won't occur. This fixes Bugzilla #101371. https://bugzilla.kernel.org/show_bug.cgi?id=101371 More discussion can be found from below link. http://marc.info/?l=linux-scsi&m=144163730531875&w=2 Signed-off-by: Ken Xue Acked-by: Alan Stern Cc: Xiangliang Yu Cc: James E.J. Bottomley Cc: Jens Axboe Cc: Michael Terry Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index a0af404..3636be4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3405,6 +3405,9 @@ int blk_pre_runtime_suspend(struct request_queue *q) { int ret = 0; + if (!q->dev) + return ret; + spin_lock_irq(q->queue_lock); if (q->nr_pending) { ret = -EBUSY; @@ -3432,6 +3435,9 @@ EXPORT_SYMBOL(blk_pre_runtime_suspend); */ void blk_post_runtime_suspend(struct request_queue *q, int err) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_SUSPENDED; @@ -3456,6 +3462,9 @@ EXPORT_SYMBOL(blk_post_runtime_suspend); */ void blk_pre_runtime_resume(struct request_queue *q) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); q->rpm_status = RPM_RESUMING; spin_unlock_irq(q->queue_lock); @@ -3478,6 +3487,9 @@ EXPORT_SYMBOL(blk_pre_runtime_resume); */ void blk_post_runtime_resume(struct request_queue *q, int err) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_ACTIVE; -- cgit v0.10.2 From 642c2d671ceff40e9453203ea0c66e991e11e249 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Nov 2015 12:56:15 +0100 Subject: perf: Fix PERF_EVENT_IOC_PERIOD deadlock Dmitry reported a fairly silly recursive lock deadlock for PERF_EVENT_IOC_PERIOD, fix this by explicitly doing the inactive part of __perf_event_period() instead of calling that function. Reported-by: Dmitry Vyukov Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Potapenko Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Eric Dumazet Cc: Jiri Olsa Cc: Kostya Serebryany Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: c7999c6f3fed ("perf: Fix PERF_EVENT_IOC_PERIOD migration race") Link: http://lkml.kernel.org/r/20151130115615.GJ17308@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 5854fcf..49a5118 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4225,7 +4225,14 @@ retry: goto retry; } - __perf_event_period(&pe); + if (event->attr.freq) { + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } + + local64_set(&event->hw.period_left, 0); raw_spin_unlock_irq(&ctx->lock); return 0; -- cgit v0.10.2 From ae5515d66362b9d96cdcfce504567f0b8b7bd83e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 4 Dec 2015 08:38:42 -0700 Subject: Revert: "vfio: Include No-IOMMU mode" Revert commit 033291eccbdb ("vfio: Include No-IOMMU mode") due to lack of a user. This was originally intended to fill a need for the DPDK driver, but uptake has been slow so rather than support an unproven kernel interface revert it and revisit when userspace catches up. Signed-off-by: Alex Williamson diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index da6e2ce..850d86c 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -31,21 +31,6 @@ menuconfig VFIO If you don't know what to do here, say N. -menuconfig VFIO_NOIOMMU - bool "VFIO No-IOMMU support" - depends on VFIO - help - VFIO is built on the ability to isolate devices using the IOMMU. - Only with an IOMMU can userspace access to DMA capable devices be - considered secure. VFIO No-IOMMU mode enables IOMMU groups for - devices without IOMMU backing for the purpose of re-using the VFIO - infrastructure in a non-secure mode. Use of this mode will result - in an unsupportable kernel and will therefore taint the kernel. - Device assignment to virtual machines is also not possible with - this mode since there is no IOMMU to provide DMA translation. - - If you don't know what to do here, say N. - source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" source "virt/lib/Kconfig" diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2760a7b..56bf6db 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -940,13 +940,13 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) return -EINVAL; - group = vfio_iommu_group_get(&pdev->dev); + group = iommu_group_get(&pdev->dev); if (!group) return -EINVAL; vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); if (!vdev) { - vfio_iommu_group_put(group, &pdev->dev); + iommu_group_put(group); return -ENOMEM; } @@ -957,7 +957,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { - vfio_iommu_group_put(group, &pdev->dev); + iommu_group_put(group); kfree(vdev); return ret; } @@ -993,7 +993,7 @@ static void vfio_pci_remove(struct pci_dev *pdev) if (!vdev) return; - vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); + iommu_group_put(pdev->dev.iommu_group); kfree(vdev); if (vfio_pci_is_vga(pdev)) { diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 9da0703..6070b79 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -62,7 +62,6 @@ struct vfio_container { struct rw_semaphore group_lock; struct vfio_iommu_driver *iommu_driver; void *iommu_data; - bool noiommu; }; struct vfio_unbound_dev { @@ -85,7 +84,6 @@ struct vfio_group { struct list_head unbound_list; struct mutex unbound_lock; atomic_t opened; - bool noiommu; }; struct vfio_device { @@ -97,147 +95,6 @@ struct vfio_device { void *device_data; }; -#ifdef CONFIG_VFIO_NOIOMMU -static bool noiommu __read_mostly; -module_param_named(enable_unsafe_noiommu_support, - noiommu, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); -#endif - -/* - * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe - * and remove functions, any use cases other than acquiring the first - * reference for the purpose of calling vfio_add_group_dev() or removing - * that symmetric reference after vfio_del_group_dev() should use the raw - * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put() - * removes the device from the dummy group and cannot be nested. - */ -struct iommu_group *vfio_iommu_group_get(struct device *dev) -{ - struct iommu_group *group; - int __maybe_unused ret; - - group = iommu_group_get(dev); - -#ifdef CONFIG_VFIO_NOIOMMU - /* - * With noiommu enabled, an IOMMU group will be created for a device - * that doesn't already have one and doesn't have an iommu_ops on their - * bus. We use iommu_present() again in the main code to detect these - * fake groups. - */ - if (group || !noiommu || iommu_present(dev->bus)) - return group; - - group = iommu_group_alloc(); - if (IS_ERR(group)) - return NULL; - - iommu_group_set_name(group, "vfio-noiommu"); - ret = iommu_group_add_device(group, dev); - iommu_group_put(group); - if (ret) - return NULL; - - /* - * Where to taint? At this point we've added an IOMMU group for a - * device that is not backed by iommu_ops, therefore any iommu_ - * callback using iommu_ops can legitimately Oops. So, while we may - * be about to give a DMA capable device to a user without IOMMU - * protection, which is clearly taint-worthy, let's go ahead and do - * it here. - */ - add_taint(TAINT_USER, LOCKDEP_STILL_OK); - dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); -#endif - - return group; -} -EXPORT_SYMBOL_GPL(vfio_iommu_group_get); - -void vfio_iommu_group_put(struct iommu_group *group, struct device *dev) -{ -#ifdef CONFIG_VFIO_NOIOMMU - if (!iommu_present(dev->bus)) - iommu_group_remove_device(dev); -#endif - - iommu_group_put(group); -} -EXPORT_SYMBOL_GPL(vfio_iommu_group_put); - -#ifdef CONFIG_VFIO_NOIOMMU -static void *vfio_noiommu_open(unsigned long arg) -{ - if (arg != VFIO_NOIOMMU_IOMMU) - return ERR_PTR(-EINVAL); - if (!capable(CAP_SYS_RAWIO)) - return ERR_PTR(-EPERM); - - return NULL; -} - -static void vfio_noiommu_release(void *iommu_data) -{ -} - -static long vfio_noiommu_ioctl(void *iommu_data, - unsigned int cmd, unsigned long arg) -{ - if (cmd == VFIO_CHECK_EXTENSION) - return arg == VFIO_NOIOMMU_IOMMU ? 1 : 0; - - return -ENOTTY; -} - -static int vfio_iommu_present(struct device *dev, void *unused) -{ - return iommu_present(dev->bus) ? 1 : 0; -} - -static int vfio_noiommu_attach_group(void *iommu_data, - struct iommu_group *iommu_group) -{ - return iommu_group_for_each_dev(iommu_group, NULL, - vfio_iommu_present) ? -EINVAL : 0; -} - -static void vfio_noiommu_detach_group(void *iommu_data, - struct iommu_group *iommu_group) -{ -} - -static struct vfio_iommu_driver_ops vfio_noiommu_ops = { - .name = "vfio-noiommu", - .owner = THIS_MODULE, - .open = vfio_noiommu_open, - .release = vfio_noiommu_release, - .ioctl = vfio_noiommu_ioctl, - .attach_group = vfio_noiommu_attach_group, - .detach_group = vfio_noiommu_detach_group, -}; - -static struct vfio_iommu_driver vfio_noiommu_driver = { - .ops = &vfio_noiommu_ops, -}; - -/* - * Wrap IOMMU drivers, the noiommu driver is the one and only driver for - * noiommu groups (and thus containers) and not available for normal groups. - */ -#define vfio_for_each_iommu_driver(con, pos) \ - for (pos = con->noiommu ? &vfio_noiommu_driver : \ - list_first_entry(&vfio.iommu_drivers_list, \ - struct vfio_iommu_driver, vfio_next); \ - (con->noiommu ? pos != NULL : \ - &pos->vfio_next != &vfio.iommu_drivers_list); \ - pos = con->noiommu ? NULL : list_next_entry(pos, vfio_next)) -#else -#define vfio_for_each_iommu_driver(con, pos) \ - list_for_each_entry(pos, &vfio.iommu_drivers_list, vfio_next) -#endif - - /** * IOMMU driver registration */ @@ -342,8 +199,7 @@ static void vfio_group_unlock_and_free(struct vfio_group *group) /** * Group objects - create, release, get, put, search */ -static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, - bool noiommu) +static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) { struct vfio_group *group, *tmp; struct device *dev; @@ -361,7 +217,6 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, atomic_set(&group->container_users, 0); atomic_set(&group->opened, 0); group->iommu_group = iommu_group; - group->noiommu = noiommu; group->nb.notifier_call = vfio_iommu_group_notifier; @@ -397,8 +252,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.group_devt), minor), - group, "%s%d", noiommu ? "noiommu-" : "", - iommu_group_id(iommu_group)); + group, "%d", iommu_group_id(iommu_group)); if (IS_ERR(dev)) { vfio_free_group_minor(minor); vfio_group_unlock_and_free(group); @@ -786,8 +640,7 @@ int vfio_add_group_dev(struct device *dev, group = vfio_group_get_from_iommu(iommu_group); if (!group) { - group = vfio_create_group(iommu_group, - !iommu_present(dev->bus)); + group = vfio_create_group(iommu_group); if (IS_ERR(group)) { iommu_group_put(iommu_group); return PTR_ERR(group); @@ -999,7 +852,8 @@ static long vfio_ioctl_check_extension(struct vfio_container *container, */ if (!driver) { mutex_lock(&vfio.iommu_drivers_lock); - vfio_for_each_iommu_driver(container, driver) { + list_for_each_entry(driver, &vfio.iommu_drivers_list, + vfio_next) { if (!try_module_get(driver->ops->owner)) continue; @@ -1068,7 +922,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, } mutex_lock(&vfio.iommu_drivers_lock); - vfio_for_each_iommu_driver(container, driver) { + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { void *data; if (!try_module_get(driver->ops->owner)) @@ -1333,9 +1187,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) if (atomic_read(&group->container_users)) return -EINVAL; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) - return -EPERM; - f = fdget(container_fd); if (!f.file) return -EBADF; @@ -1351,13 +1202,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) down_write(&container->group_lock); - /* Real groups and fake groups cannot mix */ - if (!list_empty(&container->group_list) && - container->noiommu != group->noiommu) { - ret = -EPERM; - goto unlock_out; - } - driver = container->iommu_driver; if (driver) { ret = driver->ops->attach_group(container->iommu_data, @@ -1367,7 +1211,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) } group->container = container; - container->noiommu = group->noiommu; list_add(&group->container_next, &container->group_list); /* Get a reference on the container and mark a user within the group */ @@ -1398,9 +1241,6 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) !group->container->iommu_driver || !vfio_group_viable(group)) return -EINVAL; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) - return -EPERM; - device = vfio_device_get_from_name(group, buf); if (!device) return -ENODEV; @@ -1443,10 +1283,6 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) fd_install(ret, filep); - if (group->noiommu) - dev_warn(device->dev, "vfio-noiommu device opened by user " - "(%s:%d)\n", current->comm, task_pid_nr(current)); - return ret; } @@ -1535,11 +1371,6 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) if (!group) return -ENODEV; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) { - vfio_group_put(group); - return -EPERM; - } - /* Do we need multiple instances of the group open? Seems not. */ opened = atomic_cmpxchg(&group->opened, 0, 1); if (opened) { @@ -1702,11 +1533,6 @@ struct vfio_group *vfio_group_get_external_user(struct file *filep) if (!atomic_inc_not_zero(&group->container_users)) return ERR_PTR(-EINVAL); - if (group->noiommu) { - atomic_dec(&group->container_users); - return ERR_PTR(-EPERM); - } - if (!group->container->iommu_driver || !vfio_group_viable(group)) { atomic_dec(&group->container_users); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 610a86a..ddb4409 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -44,9 +44,6 @@ struct vfio_device_ops { void (*request)(void *device_data, unsigned int count); }; -extern struct iommu_group *vfio_iommu_group_get(struct device *dev); -extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); - extern int vfio_add_group_dev(struct device *dev, const struct vfio_device_ops *ops, void *device_data); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 751b69f..9fd7b5d 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -39,13 +39,6 @@ #define VFIO_SPAPR_TCE_v2_IOMMU 7 /* - * The No-IOMMU IOMMU offers no translation or isolation for devices and - * supports no ioctls outside of VFIO_CHECK_EXTENSION. Use of VFIO's No-IOMMU - * code will taint the host kernel and should be used with extreme caution. - */ -#define VFIO_NOIOMMU_IOMMU 8 - -/* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between * kernel and userspace. We therefore use the _IO() macro for these -- cgit v0.10.2 From 84ed91526f9881886d70a082032236edaa20e7d4 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Fri, 4 Dec 2015 15:53:42 +0200 Subject: xhci: Fix memory leak in xhci_pme_acpi_rtd3_enable() There is a memory leak because acpi_evaluate_dsm() actually returns an object which the caller is supposed to release. Fix this by calling ACPI_FREE() for the returned object (this expands to kfree() so passing NULL there is fine as well). While there correct indentation in !CONFIG_ACPI case. Signed-off-by: Mika Westerberg Cc: stable # v4.2 Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 17f6897..c621090 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -188,10 +188,14 @@ static void xhci_pme_acpi_rtd3_enable(struct pci_dev *dev) 0xb7, 0x0c, 0x34, 0xac, 0x01, 0xe9, 0xbf, 0x45, 0xb7, 0xe6, 0x2b, 0x34, 0xec, 0x93, 0x1e, 0x23, }; - acpi_evaluate_dsm(ACPI_HANDLE(&dev->dev), intel_dsm_uuid, 3, 1, NULL); + union acpi_object *obj; + + obj = acpi_evaluate_dsm(ACPI_HANDLE(&dev->dev), intel_dsm_uuid, 3, 1, + NULL); + ACPI_FREE(obj); } #else - static void xhci_pme_acpi_rtd3_enable(struct pci_dev *dev) { } +static void xhci_pme_acpi_rtd3_enable(struct pci_dev *dev) { } #endif /* CONFIG_ACPI */ /* called during probe() after chip reset completes */ -- cgit v0.10.2 From 096b110a3dd3c868e4610937c80d2e3f3357c1a9 Mon Sep 17 00:00:00 2001 From: Chunfeng Yun Date: Fri, 4 Dec 2015 15:53:43 +0200 Subject: usb: xhci: fix config fail of FS hub behind a HS hub with MTT if a full speed hub connects to a high speed hub which supports MTT, the MTT field of its slot context will be set to 1 when xHCI driver setups an xHCI virtual device in xhci_setup_addressable_virt_dev(); once usb core fetch its hub descriptor, and need to update the xHC's internal data structures for the device, the HUB field of its slot context will be set to 1 too, meanwhile MTT is also set before, this will cause configure endpoint command fail, so in the case, we should clear MTT to 0 for full speed hub according to section 6.2.2 Signed-off-by: Chunfeng Yun Cc: stable Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index dfa44d3..3f91270 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -4778,8 +4778,16 @@ int xhci_update_hub_device(struct usb_hcd *hcd, struct usb_device *hdev, ctrl_ctx->add_flags |= cpu_to_le32(SLOT_FLAG); slot_ctx = xhci_get_slot_ctx(xhci, config_cmd->in_ctx); slot_ctx->dev_info |= cpu_to_le32(DEV_HUB); + /* + * refer to section 6.2.2: MTT should be 0 for full speed hub, + * but it may be already set to 1 when setup an xHCI virtual + * device, so clear it anyway. + */ if (tt->multi) slot_ctx->dev_info |= cpu_to_le32(DEV_MTT); + else if (hdev->speed == USB_SPEED_FULL) + slot_ctx->dev_info &= cpu_to_le32(~DEV_MTT); + if (xhci->hci_version > 0x95) { xhci_dbg(xhci, "xHCI version %x needs hub " "TT think time and number of ports\n", -- cgit v0.10.2 From 6406eeb3f5bb376c7d9674e61f8da34ce7f05e8d Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 3 Dec 2015 17:26:27 -0500 Subject: usb: Quiet down false peer failure messages My recent Intel box is spewing these messages: xhci_hcd 0000:00:14.0: xHCI Host Controller xhci_hcd 0000:00:14.0: new USB bus registered, assigned bus number 2 usb usb2: New USB device found, idVendor=1d6b, idProduct=0003 usb usb2: New USB device strings: Mfr=3, Product=2, SerialNumber=1 usb usb2: Product: xHCI Host Controller usb usb2: Manufacturer: Linux 4.3.0+ xhci-hcd usb usb2: SerialNumber: 0000:00:14.0 hub 2-0:1.0: USB hub found hub 2-0:1.0: 6 ports detected usb: failed to peer usb2-port2 and usb1-port1 by location (usb2-port2:none) (usb1-port1:usb2-port1) usb usb2-port2: failed to peer to usb1-port1 (-16) usb: port power management may be unreliable usb: failed to peer usb2-port3 and usb1-port1 by location (usb2-port3:none) (usb1-port1:usb2-port1) usb usb2-port3: failed to peer to usb1-port1 (-16) usb: failed to peer usb2-port5 and usb1-port1 by location (usb2-port5:none) (usb1-port1:usb2-port1) usb usb2-port5: failed to peer to usb1-port1 (-16) usb: failed to peer usb2-port6 and usb1-port1 by location (usb2-port6:none) (usb1-port1:usb2-port1) usb usb2-port6: failed to peer to usb1-port1 (-16) Diving into the acpi tables, I noticed the EHCI hub has 12 ports while the XHCI hub has 8 ports. Most of those ports are of connect type USB_PORT_NOT_USED (including port 1 of the EHCI hub). Further the unused ports have location data initialized to 0x80000000. Now each unused port on the xhci hub walks the port list and finds a matching peer with port1 of the EHCI hub because the zero'd out group id bits falsely match. After port1 of the XHCI hub, each following matching peer will generate the above warning. These warnings seem to be harmless for this scenario as I don't think it matters that unused ports could not create a peer link. The attached patch utilizes that assumption and just turns the pr_warn into pr_debug to quiet things down. Tested on my Intel box. Signed-off-by: Don Zickus Acked-by: Dan Williams Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/usb/core/port.c b/drivers/usb/core/port.c index 2106183..5487fe308 100644 --- a/drivers/usb/core/port.c +++ b/drivers/usb/core/port.c @@ -206,7 +206,7 @@ static int link_peers(struct usb_port *left, struct usb_port *right) else method = "default"; - pr_warn("usb: failed to peer %s and %s by %s (%s:%s) (%s:%s)\n", + pr_debug("usb: failed to peer %s and %s by %s (%s:%s) (%s:%s)\n", dev_name(&left->dev), dev_name(&right->dev), method, dev_name(&left->dev), lpeer ? dev_name(&lpeer->dev) : "none", @@ -265,7 +265,7 @@ static void link_peers_report(struct usb_port *left, struct usb_port *right) if (rc == 0) { dev_dbg(&left->dev, "peered to %s\n", dev_name(&right->dev)); } else { - dev_warn(&left->dev, "failed to peer to %s (%d)\n", + dev_dbg(&left->dev, "failed to peer to %s (%d)\n", dev_name(&right->dev), rc); pr_warn_once("usb: port power management may be unreliable\n"); usb_port_block_power_off = 1; -- cgit v0.10.2 From 4a0c4c36094cefd85cbe489590382ef69516ccef Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 2 Dec 2015 20:36:55 +0100 Subject: USB: host: ohci-at91: fix a crash in ohci_hcd_at91_overcurrent_irq The interrupt handler, ohci_hcd_at91_overcurrent_irq may be called right after registration. At that time, pdev->dev.platform_data is not yet set, leading to a NULL pointer dereference. Fixes: e4df92279fd9 (USB: host: ohci-at91: merge loops in ohci_hcd_at91_drv_probe) Reported-by: Peter Rosin Tested-by: Peter Rosin Signed-off-by: Alexandre Belloni Cc: stable@vger.kernel.org # 4.3+ Acked-by: Nicolas Ferre Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/usb/host/ohci-at91.c b/drivers/usb/host/ohci-at91.c index 342ffd1..8c6e15b 100644 --- a/drivers/usb/host/ohci-at91.c +++ b/drivers/usb/host/ohci-at91.c @@ -473,6 +473,8 @@ static int ohci_hcd_at91_drv_probe(struct platform_device *pdev) if (!pdata) return -ENOMEM; + pdev->dev.platform_data = pdata; + if (!of_property_read_u32(np, "num-ports", &ports)) pdata->ports = ports; @@ -483,6 +485,7 @@ static int ohci_hcd_at91_drv_probe(struct platform_device *pdev) */ if (i >= pdata->ports) { pdata->vbus_pin[i] = -EINVAL; + pdata->overcurrent_pin[i] = -EINVAL; continue; } @@ -513,10 +516,8 @@ static int ohci_hcd_at91_drv_probe(struct platform_device *pdev) } at91_for_each_port(i) { - if (i >= pdata->ports) { - pdata->overcurrent_pin[i] = -EINVAL; - continue; - } + if (i >= pdata->ports) + break; pdata->overcurrent_pin[i] = of_get_named_gpio_flags(np, "atmel,oc-gpio", i, &flags); @@ -552,8 +553,6 @@ static int ohci_hcd_at91_drv_probe(struct platform_device *pdev) } } - pdev->dev.platform_data = pdata; - device_init_wakeup(&pdev->dev, 1); return usb_hcd_at91_probe(&ohci_at91_hc_driver, pdev); } -- cgit v0.10.2 From 3845d2953aacf00ad069806ba8d1495675069f23 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 4 Dec 2015 10:28:14 -0600 Subject: PCI/MSI: Only use the generic MSI layer when domain is hierarchical Since d8a1cb757550 ("PCI/MSI: Let pci_msi_get_domain use struct device::msi_domain"), we use the MSI domain associated with the PCI device. But finding an MSI domain doesn't mean that the domain is implemented using the generic MSI domain API, and a number of MSI controllers are still using arch_setup_msi_irq() and arch_teardown_msi_irqs(). Check that the domain we just obtained is hierarchical. If it is, we can use the new generic MSI stuff. Otherwise we have to fall back to the old arch_setup_msi_irq() and arch_teardown_msi_irqs() interfaces. This avoids an oops in msi_domain_alloc_irqs() on systems with R-Car, Tegra, Armada 370, and probably other DesignWare-based host controllers. Fixes: d8a1cb757550 ("PCI/MSI: Let pci_msi_get_domain use struct device::msi_domain") Reported-by: Phil Edworthy Tested-by: Phil Edworthy Signed-off-by: Marc Zyngier Signed-off-by: Bjorn Helgaas Acked-by: Thomas Gleixner CC: stable@vger.kernel.org # v4.3+ diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 53e4632..7eaa4c8 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -54,7 +54,7 @@ static int pci_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) struct irq_domain *domain; domain = pci_msi_get_domain(dev); - if (domain) + if (domain && irq_domain_is_hierarchy(domain)) return pci_msi_domain_alloc_irqs(domain, dev, nvec, type); return arch_setup_msi_irqs(dev, nvec, type); @@ -65,7 +65,7 @@ static void pci_msi_teardown_msi_irqs(struct pci_dev *dev) struct irq_domain *domain; domain = pci_msi_get_domain(dev); - if (domain) + if (domain && irq_domain_is_hierarchy(domain)) pci_msi_domain_free_irqs(domain, dev); else arch_teardown_msi_irqs(dev); -- cgit v0.10.2 From 863c1ca73f56d3473ff1d06357b31b0063538a96 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Tue, 17 Nov 2015 16:25:45 +0100 Subject: ARM: at91/dt: add always-on to 1.8V regulator As the SDHCI controller needs the 1.8V line to be always enabled for some eMMC configurations, set the proper "regulator-always-on" property to the board DTS files. Note that the sdhci classical regulator definitions doesn't suit our controller for this 1.8V purpose. Signed-off-by: Nicolas Ferre Acked-by: Ludovic Desroches Signed-off-by: Alexandre Belloni diff --git a/arch/arm/boot/dts/at91-sama5d2_xplained.dts b/arch/arm/boot/dts/at91-sama5d2_xplained.dts index ad6de73..e74df32 100644 --- a/arch/arm/boot/dts/at91-sama5d2_xplained.dts +++ b/arch/arm/boot/dts/at91-sama5d2_xplained.dts @@ -184,6 +184,7 @@ regulator-name = "VDD_SDHC_1V8"; regulator-min-microvolt = <1800000>; regulator-max-microvolt = <1800000>; + regulator-always-on; }; }; }; -- cgit v0.10.2 From 087ffd4eae9929afd06f6a709861df3c3508492a Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 4 Dec 2015 12:29:28 -0500 Subject: jbd2: fix null committed data return in undo_access introduced jbd2_write_access_granted() to improve write|undo_access speed, but missed to check the status of b_committed_data which caused a kernel panic on ocfs2. [ 6538.405938] ------------[ cut here ]------------ [ 6538.406686] kernel BUG at fs/ocfs2/suballoc.c:2400! [ 6538.406686] invalid opcode: 0000 [#1] SMP [ 6538.406686] Modules linked in: ocfs2 nfsd lockd grace nfs_acl auth_rpcgss sunrpc autofs4 ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs sd_mod sg ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i libcxgbi cxgb3 mdio ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ppdev xen_kbdfront xen_netfront xen_fbfront parport_pc parport pcspkr i2c_piix4 acpi_cpufreq ext4 jbd2 mbcache xen_blkfront floppy pata_acpi ata_generic ata_piix cirrus ttm drm_kms_helper drm fb_sys_fops sysimgblt sysfillrect i2c_core syscopyarea dm_mirror dm_region_hash dm_log dm_mod [ 6538.406686] CPU: 1 PID: 16265 Comm: mmap_truncate Not tainted 4.3.0 #1 [ 6538.406686] Hardware name: Xen HVM domU, BIOS 4.3.1OVM 05/14/2014 [ 6538.406686] task: ffff88007c2bab00 ti: ffff880075b78000 task.ti: ffff880075b78000 [ 6538.406686] RIP: 0010:[] [] ocfs2_block_group_clear_bits+0x23b/0x250 [ocfs2] [ 6538.406686] RSP: 0018:ffff880075b7b7f8 EFLAGS: 00010246 [ 6538.406686] RAX: ffff8800760c5b40 RBX: ffff88006c06a000 RCX: ffffffffa06e6df0 [ 6538.406686] RDX: 0000000000000000 RSI: ffff88007a6f6ea0 RDI: ffff88007a760430 [ 6538.406686] RBP: ffff880075b7b878 R08: 0000000000000002 R09: 0000000000000001 [ 6538.406686] R10: ffffffffa06769be R11: 0000000000000000 R12: 0000000000000001 [ 6538.406686] R13: ffffffffa06a1750 R14: 0000000000000001 R15: ffff88007a6f6ea0 [ 6538.406686] FS: 00007f17fde30720(0000) GS:ffff88007f040000(0000) knlGS:0000000000000000 [ 6538.406686] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 6538.406686] CR2: 0000000000601730 CR3: 000000007aea0000 CR4: 00000000000406e0 [ 6538.406686] Stack: [ 6538.406686] ffff88007c2bb5b0 ffff880075b7b8e0 ffff88007a7604b0 ffff88006c640800 [ 6538.406686] ffff88007a7604b0 ffff880075d77390 0000000075b7b878 ffffffffa06a309d [ 6538.406686] ffff880075d752d8 ffff880075b7b990 ffff880075b7b898 0000000000000000 [ 6538.406686] Call Trace: [ 6538.406686] [] ? ocfs2_read_group_descriptor+0x6d/0xa0 [ocfs2] [ 6538.406686] [] _ocfs2_free_suballoc_bits+0xe4/0x320 [ocfs2] [ 6538.406686] [] ? ocfs2_put_slot+0xf0/0xf0 [ocfs2] [ 6538.406686] [] _ocfs2_free_clusters+0xee/0x210 [ocfs2] [ 6538.406686] [] ? ocfs2_put_slot+0xf0/0xf0 [ocfs2] [ 6538.406686] [] ? ocfs2_put_slot+0xf0/0xf0 [ocfs2] [ 6538.406686] [] ? ocfs2_extend_trans+0x50/0x1a0 [ocfs2] [ 6538.406686] [] ocfs2_free_clusters+0x15/0x20 [ocfs2] [ 6538.406686] [] ocfs2_replay_truncate_records+0xfc/0x290 [ocfs2] [ 6538.406686] [] ? ocfs2_start_trans+0xec/0x1d0 [ocfs2] [ 6538.406686] [] __ocfs2_flush_truncate_log+0x140/0x2d0 [ocfs2] [ 6538.406686] [] ? ocfs2_reserve_blocks_for_rec_trunc.clone.0+0x44/0x170 [ocfs2] [ 6538.406686] [] ocfs2_remove_btree_range+0x374/0x630 [ocfs2] [ 6538.406686] [] ? jbd2_journal_stop+0x25b/0x470 [jbd2] [ 6538.406686] [] ocfs2_commit_truncate+0x305/0x670 [ocfs2] [ 6538.406686] [] ? ocfs2_journal_access_eb+0x20/0x20 [ocfs2] [ 6538.406686] [] ocfs2_truncate_file+0x297/0x380 [ocfs2] [ 6538.406686] [] ? jbd2_journal_begin_ordered_truncate+0x64/0xc0 [jbd2] [ 6538.406686] [] ocfs2_setattr+0x572/0x860 [ocfs2] [ 6538.406686] [] ? current_fs_time+0x3f/0x50 [ 6538.406686] [] notify_change+0x1d7/0x340 [ 6538.406686] [] ? generic_getxattr+0x79/0x80 [ 6538.406686] [] do_truncate+0x66/0x90 [ 6538.406686] [] ? __audit_syscall_entry+0xb0/0x110 [ 6538.406686] [] do_sys_ftruncate.clone.0+0xf3/0x120 [ 6538.406686] [] SyS_ftruncate+0xe/0x10 [ 6538.406686] [] entry_SYSCALL_64_fastpath+0x12/0x71 [ 6538.406686] Code: 28 48 81 ee b0 04 00 00 48 8b 92 50 fb ff ff 48 8b 80 b0 03 00 00 48 39 90 88 00 00 00 0f 84 30 fe ff ff 0f 0b eb fe 0f 0b eb fe <0f> 0b 0f 1f 00 eb fb 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 [ 6538.406686] RIP [] ocfs2_block_group_clear_bits+0x23b/0x250 [ocfs2] [ 6538.406686] RSP [ 6538.691128] ---[ end trace 31cd7011d6770d7e ]--- [ 6538.694492] Kernel panic - not syncing: Fatal exception [ 6538.695484] Kernel Offset: disabled Fixes: de92c8caf16c("jbd2: speedup jbd2_journal_get_[write|undo]_access()") Cc: Signed-off-by: Junxiao Bi Signed-off-by: Theodore Ts'o diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b996212..1498ad9 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1009,7 +1009,8 @@ out: } /* Fast check whether buffer is already attached to the required transaction */ -static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh) +static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh, + bool undo) { struct journal_head *jh; bool ret = false; @@ -1036,6 +1037,9 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh) jh = READ_ONCE(bh->b_private); if (!jh) goto out; + /* For undo access buffer must have data copied */ + if (undo && !jh->b_committed_data) + goto out; if (jh->b_transaction != handle->h_transaction && jh->b_next_transaction != handle->h_transaction) goto out; @@ -1073,7 +1077,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int rc; - if (jbd2_write_access_granted(handle, bh)) + if (jbd2_write_access_granted(handle, bh, false)) return 0; jh = jbd2_journal_add_journal_head(bh); @@ -1210,7 +1214,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) char *committed_data = NULL; JBUFFER_TRACE(jh, "entry"); - if (jbd2_write_access_granted(handle, bh)) + if (jbd2_write_access_granted(handle, bh, true)) return 0; jh = jbd2_journal_add_journal_head(bh); -- cgit v0.10.2 From 8423536fe453b3623f4ed7c2c3cca9b86af1ace5 Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Tue, 1 Dec 2015 11:44:40 +0100 Subject: ARM: at91: fix pinctrl driver selection Move the selection of the pinctrl driver to SoC family level since we have two pinctrl drivers. It is useless to select one which is not compatible with the SoC. [abelloni: fixed pm.c when only sama2d2 is selected] Signed-off-by: Ludovic Desroches Acked-by: Nicolas Ferre Signed-off-by: Alexandre Belloni diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig index 9267300..28656c2 100644 --- a/arch/arm/mach-at91/Kconfig +++ b/arch/arm/mach-at91/Kconfig @@ -4,7 +4,6 @@ menuconfig ARCH_AT91 select ARCH_REQUIRE_GPIOLIB select COMMON_CLK_AT91 select PINCTRL - select PINCTRL_AT91 select SOC_BUS if ARCH_AT91 @@ -17,6 +16,7 @@ config SOC_SAMA5D2 select HAVE_AT91_USB_CLK select HAVE_AT91_H32MX select HAVE_AT91_GENERATED_CLK + select PINCTRL_AT91PIO4 help Select this if ou are using one of Atmel's SAMA5D2 family SoC. @@ -27,6 +27,7 @@ config SOC_SAMA5D3 select HAVE_AT91_UTMI select HAVE_AT91_SMD select HAVE_AT91_USB_CLK + select PINCTRL_AT91 help Select this if you are using one of Atmel's SAMA5D3 family SoC. This support covers SAMA5D31, SAMA5D33, SAMA5D34, SAMA5D35, SAMA5D36. @@ -40,6 +41,7 @@ config SOC_SAMA5D4 select HAVE_AT91_SMD select HAVE_AT91_USB_CLK select HAVE_AT91_H32MX + select PINCTRL_AT91 help Select this if you are using one of Atmel's SAMA5D4 family SoC. @@ -50,6 +52,7 @@ config SOC_AT91RM9200 select CPU_ARM920T select HAVE_AT91_USB_CLK select MIGHT_HAVE_PCI + select PINCTRL_AT91 select SOC_SAM_V4_V5 select SRAM if PM help @@ -65,6 +68,7 @@ config SOC_AT91SAM9 select HAVE_AT91_UTMI select HAVE_FB_ATMEL select MEMORY + select PINCTRL_AT91 select SOC_SAM_V4_V5 select SRAM if PM help diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index 80e277c..23726fb 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -41,8 +41,10 @@ * implementation should be moved down into the pinctrl driver and get * called as part of the generic suspend/resume path. */ +#ifdef CONFIG_PINCTRL_AT91 extern void at91_pinctrl_gpio_suspend(void); extern void at91_pinctrl_gpio_resume(void); +#endif static struct { unsigned long uhp_udp_mask; @@ -151,8 +153,9 @@ static void at91_pm_suspend(suspend_state_t state) static int at91_pm_enter(suspend_state_t state) { +#ifdef CONFIG_PINCTRL_AT91 at91_pinctrl_gpio_suspend(); - +#endif switch (state) { /* * Suspend-to-RAM is like STANDBY plus slow clock mode, so @@ -192,7 +195,9 @@ static int at91_pm_enter(suspend_state_t state) error: target_state = PM_SUSPEND_ON; +#ifdef CONFIG_PINCTRL_AT91 at91_pinctrl_gpio_resume(); +#endif return 0; } -- cgit v0.10.2 From 43d1c0eb7e11919f85200d2fce211173526f7304 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 20 Nov 2015 22:22:34 +0100 Subject: block: detach bdev inode from its wb in __blkdev_put() Since 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") inode, at some point in its lifetime, gets attached to a wb (struct bdi_writeback). Detaching happens on evict, in inode_detach_wb() called from __destroy_inode(), and involves updating wb. However, detaching an internal bdev inode from its wb in __destroy_inode() is too late. Its bdi and by extension root wb are embedded into struct request_queue, which has different lifetime rules and can be freed long before the final bdput() is called (can be from __fput() of a corresponding /dev inode, through dput() - evict() - bd_forget(). bdevs hold onto the underlying disk/queue pair only while opened; as soon as bdev is closed all bets are off. In fact, disk/queue can be gone before __blkdev_put() even returns: 1499 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) 1500 { ... 1518 if (bdev->bd_contains == bdev) { 1519 if (disk->fops->release) 1520 disk->fops->release(disk, mode); [ Driver puts its references to disk/queue ] 1521 } 1522 if (!bdev->bd_openers) { 1523 struct module *owner = disk->fops->owner; 1524 1525 disk_put_part(bdev->bd_part); 1526 bdev->bd_part = NULL; 1527 bdev->bd_disk = NULL; 1528 if (bdev != bdev->bd_contains) 1529 victim = bdev->bd_contains; 1530 bdev->bd_contains = NULL; 1531 1532 put_disk(disk); [ We put ours, the queue is gone The last bdput() would result in a write to invalid memory ] 1533 module_put(owner); ... 1539 } Since bdev inodes are special anyway, detach them in __blkdev_put() after clearing inode's dirty bits, turning the problematic inode_detach_wb() in __destroy_inode() into a noop. add_disk() grabs its disk->queue since 523e1d399ce0 ("block: make gendisk hold a reference to its queue"), so the old ->release comment is removed in favor of the new inode_detach_wb() comment. Cc: stable@vger.kernel.org # 4.2+, needs backporting Signed-off-by: Ilya Dryomov Acked-by: Tejun Heo Tested-by: Raghavendra K T Signed-off-by: Jens Axboe diff --git a/fs/block_dev.c b/fs/block_dev.c index c25639e..44d4a1e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1523,11 +1523,14 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); + + bdev_write_inode(bdev); /* - * ->release can cause the queue to disappear, so flush all - * dirty data before. + * Detaching bdev inode from its wb in __destroy_inode() + * is too late: the queue which embeds its bdi (along with + * root wb) can be gone as soon as we put_disk() below. */ - bdev_write_inode(bdev); + inode_detach_wb(bdev->bd_inode); } if (bdev->bd_contains == bdev) { if (disk->fops->release) -- cgit v0.10.2 From a74a821624c0c75388a193337babd17a8c02c740 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 4 Dec 2015 16:44:24 +0100 Subject: ALSA: rme96: Fix unexpected volume reset after rate changes rme96 driver needs to reset DAC depending on the sample rate, and this results in resetting to the max volume suddenly. It's because of the missing call of snd_rme96_apply_dac_volume(). However, calling this function right after the DAC reset still may not work, and we need some delay before this call. Since the DAC reset and the procedure after that are performed in the spinlock, we delay the DAC volume restore at the end after the spinlock. Reported-and-tested-by: Sylvain LABOISNE Cc: Signed-off-by: Takashi Iwai diff --git a/sound/pci/rme96.c b/sound/pci/rme96.c index 714df90..41c31db 100644 --- a/sound/pci/rme96.c +++ b/sound/pci/rme96.c @@ -741,10 +741,11 @@ snd_rme96_playback_setrate(struct rme96 *rme96, { /* change to/from double-speed: reset the DAC (if available) */ snd_rme96_reset_dac(rme96); + return 1; /* need to restore volume */ } else { writel(rme96->wcreg, rme96->iobase + RME96_IO_CONTROL_REGISTER); + return 0; } - return 0; } static int @@ -980,6 +981,7 @@ snd_rme96_playback_hw_params(struct snd_pcm_substream *substream, struct rme96 *rme96 = snd_pcm_substream_chip(substream); struct snd_pcm_runtime *runtime = substream->runtime; int err, rate, dummy; + bool apply_dac_volume = false; runtime->dma_area = (void __force *)(rme96->iobase + RME96_IO_PLAY_BUFFER); @@ -993,24 +995,26 @@ snd_rme96_playback_hw_params(struct snd_pcm_substream *substream, { /* slave clock */ if ((int)params_rate(params) != rate) { - spin_unlock_irq(&rme96->lock); - return -EIO; - } - } else if ((err = snd_rme96_playback_setrate(rme96, params_rate(params))) < 0) { - spin_unlock_irq(&rme96->lock); - return err; - } - if ((err = snd_rme96_playback_setformat(rme96, params_format(params))) < 0) { - spin_unlock_irq(&rme96->lock); - return err; + err = -EIO; + goto error; + } + } else { + err = snd_rme96_playback_setrate(rme96, params_rate(params)); + if (err < 0) + goto error; + apply_dac_volume = err > 0; /* need to restore volume later? */ } + + err = snd_rme96_playback_setformat(rme96, params_format(params)); + if (err < 0) + goto error; snd_rme96_setframelog(rme96, params_channels(params), 1); if (rme96->capture_periodsize != 0) { if (params_period_size(params) << rme96->playback_frlog != rme96->capture_periodsize) { - spin_unlock_irq(&rme96->lock); - return -EBUSY; + err = -EBUSY; + goto error; } } rme96->playback_periodsize = @@ -1021,9 +1025,16 @@ snd_rme96_playback_hw_params(struct snd_pcm_substream *substream, rme96->wcreg &= ~(RME96_WCR_PRO | RME96_WCR_DOLBY | RME96_WCR_EMP); writel(rme96->wcreg |= rme96->wcreg_spdif_stream, rme96->iobase + RME96_IO_CONTROL_REGISTER); } + + err = 0; + error: spin_unlock_irq(&rme96->lock); - - return 0; + if (apply_dac_volume) { + usleep_range(3000, 10000); + snd_rme96_apply_dac_volume(rme96); + } + + return err; } static int -- cgit v0.10.2 From 7f52f31443ae9b2e6a71ec54cfc5a2c89006ea27 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 4 Dec 2015 16:21:08 -0600 Subject: PCI: altera: Fix loop in tlp_read_packet() TLP_LOOP is 500 and the "loop" variable was a u8 so "loop < TLP_LOOP" is always true. We only need this condition to work if there is a problem so it would have been easy to miss this in testing. Make it a normal for loop with "int i" instead of over thinking things and making it complicated. Fixes: 6bb4dd154ae8 ("PCI: altera: Add Altera PCIe host controller driver") Signed-off-by: Dan Carpenter Signed-off-by: Bjorn Helgaas Acked-by: Ley Foon Tan diff --git a/drivers/pci/host/pcie-altera.c b/drivers/pci/host/pcie-altera.c index e5dda38..f0820d3 100644 --- a/drivers/pci/host/pcie-altera.c +++ b/drivers/pci/host/pcie-altera.c @@ -166,7 +166,7 @@ static bool altera_pcie_valid_config(struct altera_pcie *pcie, static int tlp_read_packet(struct altera_pcie *pcie, u32 *value) { - u8 loop; + int i; bool sop = 0; u32 ctrl; u32 reg0, reg1; @@ -175,7 +175,7 @@ static int tlp_read_packet(struct altera_pcie *pcie, u32 *value) * Minimum 2 loops to read TLP headers and 1 loop to read data * payload. */ - for (loop = 0; loop < TLP_LOOP; loop++) { + for (i = 0; i < TLP_LOOP; i++) { ctrl = cra_readl(pcie, RP_RXCPL_STATUS); if ((ctrl & RP_RXCPL_SOP) || (ctrl & RP_RXCPL_EOP) || sop) { reg0 = cra_readl(pcie, RP_RXCPL_REG0); -- cgit v0.10.2 From 23ec56708af7155414a979675dc8132a5d6e16ef Mon Sep 17 00:00:00 2001 From: Ley Foon Tan Date: Fri, 4 Dec 2015 16:21:12 -0600 Subject: PCI: altera: Fix Requester ID for config accesses The Requester ID should use the Root Port devfn and it should be always 0. Previously we constructed the Requester ID using the *Completer* devfn, i.e., the devfn of the Function we expect to respond to the config access. This causes issues when accessing configuration space for devices other than the Root Port. Build the Requester ID using the Root Port devfn. Tested on Ethernet adapter card with multi-functions. Signed-off-by: Ley Foon Tan Signed-off-by: Bjorn Helgaas diff --git a/drivers/pci/host/pcie-altera.c b/drivers/pci/host/pcie-altera.c index f0820d3..ed736a9 100644 --- a/drivers/pci/host/pcie-altera.c +++ b/drivers/pci/host/pcie-altera.c @@ -57,6 +57,7 @@ #define TLP_REQ_ID(bus, devfn) (((bus) << 8) | (devfn)) #define TLP_HDR_SIZE 3 #define TLP_LOOP 500 +#define RP_DEVFN 0 #define INTX_NUM 4 @@ -233,7 +234,7 @@ static int tlp_cfg_dword_read(struct altera_pcie *pcie, u8 bus, u32 devfn, else headers[0] = TLP_CFG_DW0(TLP_FMTTYPE_CFGRD1); - headers[1] = TLP_CFG_DW1(TLP_REQ_ID(pcie->root_bus_nr, devfn), + headers[1] = TLP_CFG_DW1(TLP_REQ_ID(pcie->root_bus_nr, RP_DEVFN), TLP_READ_TAG, byte_en); headers[2] = TLP_CFG_DW2(bus, devfn, where); @@ -253,7 +254,7 @@ static int tlp_cfg_dword_write(struct altera_pcie *pcie, u8 bus, u32 devfn, else headers[0] = TLP_CFG_DW0(TLP_FMTTYPE_CFGWR1); - headers[1] = TLP_CFG_DW1(TLP_REQ_ID(pcie->root_bus_nr, devfn), + headers[1] = TLP_CFG_DW1(TLP_REQ_ID(pcie->root_bus_nr, RP_DEVFN), TLP_WRITE_TAG, byte_en); headers[2] = TLP_CFG_DW2(bus, devfn, where); -- cgit v0.10.2 From ea1d3795f65e14619642b5f67382800f58e43f3d Mon Sep 17 00:00:00 2001 From: Ley Foon Tan Date: Fri, 4 Dec 2015 16:21:16 -0600 Subject: PCI: altera: Check TLP completion status Check TLP packet successful completion status. This fix the issue when accessing multi-function devices in enumeration process, TLP will return error when accessing non-exist function number. Returns PCI error code instead of generic errno. Tested on Ethernet adapter card with multi-functions. [bhelgaas: simplify completion status checking code] Signed-off-by: Ley Foon Tan Signed-off-by: Bjorn Helgaas diff --git a/drivers/pci/host/pcie-altera.c b/drivers/pci/host/pcie-altera.c index ed736a9..072255d 100644 --- a/drivers/pci/host/pcie-altera.c +++ b/drivers/pci/host/pcie-altera.c @@ -55,6 +55,7 @@ #define TLP_CFG_DW2(bus, devfn, offset) \ (((bus) << 24) | ((devfn) << 16) | (offset)) #define TLP_REQ_ID(bus, devfn) (((bus) << 8) | (devfn)) +#define TLP_COMP_STATUS(s) (((s) >> 12) & 7) #define TLP_HDR_SIZE 3 #define TLP_LOOP 500 #define RP_DEVFN 0 @@ -171,6 +172,7 @@ static int tlp_read_packet(struct altera_pcie *pcie, u32 *value) bool sop = 0; u32 ctrl; u32 reg0, reg1; + u32 comp_status = 1; /* * Minimum 2 loops to read TLP headers and 1 loop to read data @@ -182,19 +184,25 @@ static int tlp_read_packet(struct altera_pcie *pcie, u32 *value) reg0 = cra_readl(pcie, RP_RXCPL_REG0); reg1 = cra_readl(pcie, RP_RXCPL_REG1); - if (ctrl & RP_RXCPL_SOP) + if (ctrl & RP_RXCPL_SOP) { sop = true; + comp_status = TLP_COMP_STATUS(reg1); + } if (ctrl & RP_RXCPL_EOP) { + if (comp_status) + return PCIBIOS_DEVICE_NOT_FOUND; + if (value) *value = reg0; + return PCIBIOS_SUCCESSFUL; } } udelay(5); } - return -ENOENT; + return PCIBIOS_DEVICE_NOT_FOUND; } static void tlp_write_packet(struct altera_pcie *pcie, u32 *headers, -- cgit v0.10.2 From 99496bd2971fc378226ad4413e5b72c4545714bd Mon Sep 17 00:00:00 2001 From: Ley Foon Tan Date: Fri, 4 Dec 2015 16:21:21 -0600 Subject: PCI: altera: Fix error when INTx is 4 PCI interrupt lines start at 1, not at 0. So, creates additional one interrupt when register for irq domain. Error when PCIe devices have 4 INTx: WARNING: CPU: 1 PID: 1 at kernel/irq/irqdomain.c:280 irq_domain_associate+0x17c/0x1cc() error: hwirq 0x4 is too large for dummy Tested on Ethernet adapter card with multi-functions. Signed-off-by: Ley Foon Tan Signed-off-by: Bjorn Helgaas diff --git a/drivers/pci/host/pcie-altera.c b/drivers/pci/host/pcie-altera.c index 072255d..99da549 100644 --- a/drivers/pci/host/pcie-altera.c +++ b/drivers/pci/host/pcie-altera.c @@ -467,7 +467,7 @@ static int altera_pcie_init_irq_domain(struct altera_pcie *pcie) struct device_node *node = dev->of_node; /* Setup INTx */ - pcie->irq_domain = irq_domain_add_linear(node, INTX_NUM, + pcie->irq_domain = irq_domain_add_linear(node, INTX_NUM + 1, &intx_domain_ops, pcie); if (!pcie->irq_domain) { dev_err(dev, "Failed to get a INTx IRQ domain\n"); -- cgit v0.10.2 From e0fbac1cd49c5ca6cb6a0aa918d0943ce95d906c Mon Sep 17 00:00:00 2001 From: Yuanfang Chen Date: Tue, 24 Nov 2015 12:05:01 -0500 Subject: perf/x86/intel: Make L1D_PEND_MISS.FB_FULL not constrained on Haswell There was a mistake in the Haswell constraints table. Signed-off-by: Yuanfang Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1448384701-9110-1-git-send-email-cheny@udel.edu Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index f63360b..e2a4300 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -232,7 +232,7 @@ static struct event_constraint intel_hsw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */ + INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ -- cgit v0.10.2 From 169b932a15318e8e9f2f0f12eeb55dda09c8737f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 9 Nov 2015 10:24:31 +0100 Subject: perf/x86/intel: Fix INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA macro We need to add rest of the flags to the constraint mask instead of another INTEL_ARCH_EVENT_MASK, fixing a typo. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1447061071-28085-1-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ab18b8a..d0e35eb 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -387,7 +387,7 @@ struct cpu_hw_events { /* Check flags and event code/umask, and set the HSW N/A flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) -- cgit v0.10.2 From 4e93ad601a4308d4a67673c81556580817d56940 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 4 Nov 2015 16:00:05 +0100 Subject: perf: Do not send exit event twice In case we monitor events system wide, we get EXIT event (when configured) twice for each task that exited. Note doubled lines with same pid/tid in following example: $ sudo ./perf record -a ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.480 MB perf.data (2518 samples) ] $ sudo ./perf report -D | grep EXIT 0 60290687567581 0x59910 [0x38]: PERF_RECORD_EXIT(1250:1250):(1250:1250) 0 60290687568354 0x59948 [0x38]: PERF_RECORD_EXIT(1250:1250):(1250:1250) 0 60290687988744 0x59ad8 [0x38]: PERF_RECORD_EXIT(1250:1250):(1250:1250) 0 60290687989198 0x59b10 [0x38]: PERF_RECORD_EXIT(1250:1250):(1250:1250) 1 60290692567895 0x62af0 [0x38]: PERF_RECORD_EXIT(1253:1253):(1253:1253) 1 60290692568322 0x62b28 [0x38]: PERF_RECORD_EXIT(1253:1253):(1253:1253) 2 60290692739276 0x69a18 [0x38]: PERF_RECORD_EXIT(1252:1252):(1252:1252) 2 60290692739910 0x69a50 [0x38]: PERF_RECORD_EXIT(1252:1252):(1252:1252) The reason is that the cpu contexts are processes each time we call perf_event_task. I'm changing the perf_event_aux logic to serve task_ctx and cpu contexts separately, which ensure we don't get EXIT event generated twice on same cpu context. This does not affect other auxiliary events, as they don't use task_ctx at all. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1446649205-5822-1-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 49a5118..39cf4a4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5683,6 +5683,17 @@ perf_event_aux_ctx(struct perf_event_context *ctx, } static void +perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, + struct perf_event_context *task_ctx) +{ + rcu_read_lock(); + preempt_disable(); + perf_event_aux_ctx(task_ctx, output, data); + preempt_enable(); + rcu_read_unlock(); +} + +static void perf_event_aux(perf_event_aux_output_cb output, void *data, struct perf_event_context *task_ctx) { @@ -5691,14 +5702,23 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, struct pmu *pmu; int ctxn; + /* + * If we have task_ctx != NULL we only notify + * the task context itself. The task_ctx is set + * only for EXIT events before releasing task + * context. + */ + if (task_ctx) { + perf_event_aux_task_ctx(output, data, task_ctx); + return; + } + rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); if (cpuctx->unique_pmu != pmu) goto next; perf_event_aux_ctx(&cpuctx->ctx, output, data); - if (task_ctx) - goto next; ctxn = pmu->task_ctx_nr; if (ctxn < 0) goto next; @@ -5708,12 +5728,6 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, next: put_cpu_ptr(pmu->pmu_cpu_context); } - - if (task_ctx) { - preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data); - preempt_enable(); - } rcu_read_unlock(); } @@ -8803,10 +8817,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) struct perf_event_context *child_ctx, *clone_ctx = NULL; unsigned long flags; - if (likely(!child->perf_event_ctxp[ctxn])) { - perf_event_task(child, NULL, 0); + if (likely(!child->perf_event_ctxp[ctxn])) return; - } local_irq_save(flags); /* @@ -8890,6 +8902,14 @@ void perf_event_exit_task(struct task_struct *child) for_each_task_context_nr(ctxn) perf_event_exit_task_context(child, ctxn); + + /* + * The perf_event_exit_task_context calls perf_event_task + * with child's task_ctx, which generates EXIT events for + * child contexts and sets child->perf_event_ctxp[] to NULL. + * At this point we need to send EXIT events to cpu contexts. + */ + perf_event_task(child, NULL, 0); } static void perf_free_event(struct perf_event *event, -- cgit v0.10.2 From 7c23b7c1996597dd9d60bb282fb5fa1be6ebd18b Mon Sep 17 00:00:00 2001 From: "Lu, Han" Date: Mon, 7 Dec 2015 15:59:13 +0800 Subject: ALSA: hda - Fix playback noise with 24/32 bit sample size on BXT In BXT-P A0, HD-Audio DMA requests is later than expected, and makes an audio stream sensitive to system latencies when 24/32 bits are playing. Adjusting threshold of DMA fifo to force the DMA request sooner to improve latency tolerance at the expense of power. v2: move Intel specific code to hda_intel.c Signed-off-by: Lu, Han Signed-off-by: Takashi Iwai diff --git a/include/sound/hda_register.h b/include/sound/hda_register.h index 2ae8812..94dc6a9 100644 --- a/include/sound/hda_register.h +++ b/include/sound/hda_register.h @@ -93,6 +93,9 @@ enum { SDI0, SDI1, SDI2, SDI3, SDO0, SDO1, SDO2, SDO3 }; #define AZX_REG_HSW_EM4 0x100c #define AZX_REG_HSW_EM5 0x1010 +/* Skylake/Broxton display HD-A controller Extended Mode registers */ +#define AZX_REG_SKL_EM4L 0x1040 + /* PCI space */ #define AZX_PCIREG_TCSEL 0x44 diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 963f824..bff5c8b 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -355,6 +355,8 @@ enum { ((pci)->device == 0x0d0c) || \ ((pci)->device == 0x160c)) +#define IS_BROXTON(pci) ((pci)->device == 0x5a98) + static char *driver_short_names[] = { [AZX_DRIVER_ICH] = "HDA Intel", [AZX_DRIVER_PCH] = "HDA Intel PCH", @@ -506,15 +508,36 @@ static void azx_init_pci(struct azx *chip) } } +/* + * In BXT-P A0, HD-Audio DMA requests is later than expected, + * and makes an audio stream sensitive to system latencies when + * 24/32 bits are playing. + * Adjusting threshold of DMA fifo to force the DMA request + * sooner to improve latency tolerance at the expense of power. + */ +static void bxt_reduce_dma_latency(struct azx *chip) +{ + u32 val; + + val = azx_readl(chip, SKL_EM4L); + val &= (0x3 << 20); + azx_writel(chip, SKL_EM4L, val); +} + static void hda_intel_init_chip(struct azx *chip, bool full_reset) { struct hdac_bus *bus = azx_bus(chip); + struct pci_dev *pci = chip->pci; if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) snd_hdac_set_codec_wakeup(bus, true); azx_init_chip(chip, full_reset); if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) snd_hdac_set_codec_wakeup(bus, false); + + /* reduce dma latency to avoid noise */ + if (IS_BROXTON(pci)) + bxt_reduce_dma_latency(chip); } /* calculate runtime delay from LPIB */ -- cgit v0.10.2 From 02f6ff90400d055f08b0ba0b5f0707630b6faed7 Mon Sep 17 00:00:00 2001 From: David Henningsson Date: Mon, 7 Dec 2015 11:29:31 +0100 Subject: ALSA: hda - Add inverted dmic for Packard Bell DOTS On the internal mic of the Packard Bell DOTS, one channel has an inverted signal. Add a quirk to fix this up. Cc: stable@vger.kernel.org BugLink: https://bugs.launchpad.net/bugs/1523232 Signed-off-by: David Henningsson Signed-off-by: Takashi Iwai diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 9bedf7c..ebc5362 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6409,6 +6409,7 @@ static const struct hda_fixup alc662_fixups[] = { static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1019, 0x9087, "ECS", ALC662_FIXUP_ASUS_MODE2), SND_PCI_QUIRK(0x1025, 0x022f, "Acer Aspire One", ALC662_FIXUP_INV_DMIC), + SND_PCI_QUIRK(0x1025, 0x0241, "Packard Bell DOTS", ALC662_FIXUP_INV_DMIC), SND_PCI_QUIRK(0x1025, 0x0308, "Acer Aspire 8942G", ALC662_FIXUP_ASPIRE), SND_PCI_QUIRK(0x1025, 0x031c, "Gateway NV79", ALC662_FIXUP_SKU_IGNORE), SND_PCI_QUIRK(0x1025, 0x0349, "eMachines eM250", ALC662_FIXUP_INV_DMIC), -- cgit v0.10.2 From 2ac46030331e2952a56c05bd15d99082fc410088 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 15 Nov 2015 15:11:00 +0200 Subject: virtio-net: Stop doing DMA from the stack Once virtio starts using the DMA API, we won't be able to safely DMA from the stack. virtio-net does a couple of config DMA requests from small stack buffers -- switch to using dynamically-allocated memory. This should have no effect on any performance-critical code paths. Reported-by: Andy Lutomirski Signed-off-by: Michael S. Tsirkin Tested-by: Andy Lutomirski diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d8838ded..f94ab78 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -140,6 +140,12 @@ struct virtnet_info { /* CPU hot plug notifier */ struct notifier_block nb; + + /* Control VQ buffers: protected by the rtnl lock */ + struct virtio_net_ctrl_hdr ctrl_hdr; + virtio_net_ctrl_ack ctrl_status; + u8 ctrl_promisc; + u8 ctrl_allmulti; }; struct padded_vnet_hdr { @@ -976,31 +982,30 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, struct scatterlist *out) { struct scatterlist *sgs[4], hdr, stat; - struct virtio_net_ctrl_hdr ctrl; - virtio_net_ctrl_ack status = ~0; unsigned out_num = 0, tmp; /* Caller should know better */ BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)); - ctrl.class = class; - ctrl.cmd = cmd; + vi->ctrl_status = ~0; + vi->ctrl_hdr.class = class; + vi->ctrl_hdr.cmd = cmd; /* Add header */ - sg_init_one(&hdr, &ctrl, sizeof(ctrl)); + sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr)); sgs[out_num++] = &hdr; if (out) sgs[out_num++] = out; /* Add return status. */ - sg_init_one(&stat, &status, sizeof(status)); + sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status)); sgs[out_num] = &stat; BUG_ON(out_num + 1 > ARRAY_SIZE(sgs)); virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC); if (unlikely(!virtqueue_kick(vi->cvq))) - return status == VIRTIO_NET_OK; + return vi->ctrl_status == VIRTIO_NET_OK; /* Spin for a response, the kick causes an ioport write, trapping * into the hypervisor, so the request should be handled immediately. @@ -1009,7 +1014,7 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, !virtqueue_is_broken(vi->cvq)) cpu_relax(); - return status == VIRTIO_NET_OK; + return vi->ctrl_status == VIRTIO_NET_OK; } static int virtnet_set_mac_address(struct net_device *dev, void *p) @@ -1151,7 +1156,6 @@ static void virtnet_set_rx_mode(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); struct scatterlist sg[2]; - u8 promisc, allmulti; struct virtio_net_ctrl_mac *mac_data; struct netdev_hw_addr *ha; int uc_count; @@ -1163,22 +1167,22 @@ static void virtnet_set_rx_mode(struct net_device *dev) if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX)) return; - promisc = ((dev->flags & IFF_PROMISC) != 0); - allmulti = ((dev->flags & IFF_ALLMULTI) != 0); + vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0); + vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0); - sg_init_one(sg, &promisc, sizeof(promisc)); + sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_PROMISC, sg)) dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", - promisc ? "en" : "dis"); + vi->ctrl_promisc ? "en" : "dis"); - sg_init_one(sg, &allmulti, sizeof(allmulti)); + sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_ALLMULTI, sg)) dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", - allmulti ? "en" : "dis"); + vi->ctrl_allmulti ? "en" : "dis"); uc_count = netdev_uc_count(dev); mc_count = netdev_mc_count(dev); -- cgit v0.10.2 From ea013a9b205b47b1fcbc72522146fad560af0712 Mon Sep 17 00:00:00 2001 From: Andreas Werner Date: Fri, 4 Dec 2015 18:12:49 +0100 Subject: libata-eh.c: Introduce new ata port flag for controller which lockup on read log page Some controller lockup on a ata_read_log_page. Add new ata port flag ATA_FLAG_NO_LOG_PAGE which can used to blacklist a controller. If this flag is set, any attempt to read a log page returns an error without actually issuing the command. Signed-off-by: Andreas Werner Signed-off-by: Tejun Heo diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index cb0508a..961acc7 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -1505,12 +1505,20 @@ static const char *ata_err_string(unsigned int err_mask) unsigned int ata_read_log_page(struct ata_device *dev, u8 log, u8 page, void *buf, unsigned int sectors) { + unsigned long ap_flags = dev->link->ap->flags; struct ata_taskfile tf; unsigned int err_mask; bool dma = false; DPRINTK("read log page - log 0x%x, page 0x%x\n", log, page); + /* + * Return error without actually issuing the command on controllers + * which e.g. lockup on a read log page. + */ + if (ap_flags & ATA_FLAG_NO_LOG_PAGE) + return AC_ERR_DEV; + retry: ata_tf_init(dev, &tf); if (dev->dma_mode && ata_id_has_read_log_dma_ext(dev->id) && diff --git a/include/linux/libata.h b/include/linux/libata.h index 83577f8..600c1e0 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -210,6 +210,7 @@ enum { ATA_FLAG_SLAVE_POSS = (1 << 0), /* host supports slave dev */ /* (doesn't imply presence) */ ATA_FLAG_SATA = (1 << 1), + ATA_FLAG_NO_LOG_PAGE = (1 << 5), /* do not issue log page read */ ATA_FLAG_NO_ATAPI = (1 << 6), /* No ATAPI support */ ATA_FLAG_PIO_DMA = (1 << 7), /* PIO cmds via DMA */ ATA_FLAG_PIO_LBA48 = (1 << 8), /* Host DMA engine is LBA28 only */ -- cgit v0.10.2 From 4f2568f5cb475529aa2894adc7c7912517c83cb0 Mon Sep 17 00:00:00 2001 From: Andreas Werner Date: Fri, 4 Dec 2015 18:14:14 +0100 Subject: ata/sata_fsl.c: add ATA_FLAG_NO_LOG_PAGE to blacklist the controller for log page reads Every attempt to issue a read log page command lockup the controller. The command is currently sent if the sata device includes the devlsp feature to read out the timing data. This attempt to read the data, locks up the controller and the device is not recognzied correctly (failed to set xfermode) and cannot be accessed. This was found on Freescale P1013/P1022 and T4240 CPUs using a ATP IG mSATA 4GB with the devslp feature. fsl-sata ff718000.sata: Sata FSL Platform/CSB Driver init [ 1.254195] scsi0 : sata_fsl [ 1.256004] ata1: SATA max UDMA/133 irq 74 [ 1.370666] fsl-gianfar ethernet.3: enabled errata workarounds, flags: 0x4 [ 1.470671] fsl-gianfar ethernet.4: enabled errata workarounds, flags: 0x4 [ 1.775584] ata1: Signature Update detected @ 504 msecs [ 1.947594] ata1: SATA link up 3.0 Gbps (SStatus 123 SControl 300) [ 1.948366] ata1.00: ATA-8: ATP IG mSATA, 20150311, max UDMA/133 [ 1.948371] ata1.00: 7732368 sectors, multi 0: LBA [ 1.948843] ata1.00: failed to get Identify Device Data, Emask 0x1 [ 1.948857] ata1.00: failed to set xfermode (err_mask=0x40) [ 7.467557] ata1: Signature Update detected @ 504 msecs [ 7.639560] ata1: SATA link up 3.0 Gbps (SStatus 123 SControl 300) [ 7.651320] ata1.00: failed to get Identify Device Data, Emask 0x1 [ 7.651360] ata1.00: failed to set xfermode (err_mask=0x40) [ 7.655628] ata1: limiting SATA link speed to 1.5 Gbps [ 7.659458] ata1.00: limiting speed to UDMA/133:PIO3 [ 13.163554] ata1: Signature Update detected @ 504 msecs [ 13.335558] ata1: SATA link up 3.0 Gbps (SStatus 123 SControl 300) [ 13.347298] ata1.00: failed to get Identify Device Data, Emask 0x1 [ 13.347334] ata1.00: failed to set xfermode (err_mask=0x40) [ 13.351601] ata1.00: disabled [ 13.353278] ata1: exception Emask 0x50 SAct 0x0 SErr 0x800 action 0x6 frozen t4 [ 13.359281] ata1: SError: { HostInt } [ 13.361644] ata1: hard resetting link Signed-off-by: Andreas Werner Signed-off-by: Tejun Heo diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c index 5389579..a723ae9 100644 --- a/drivers/ata/sata_fsl.c +++ b/drivers/ata/sata_fsl.c @@ -45,7 +45,8 @@ enum { SATA_FSL_MAX_PRD_DIRECT = 16, /* Direct PRDT entries */ SATA_FSL_HOST_FLAGS = (ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | - ATA_FLAG_PMP | ATA_FLAG_NCQ | ATA_FLAG_AN), + ATA_FLAG_PMP | ATA_FLAG_NCQ | + ATA_FLAG_AN | ATA_FLAG_NO_LOG_PAGE), SATA_FSL_MAX_CMDS = SATA_FSL_QUEUE_DEPTH, SATA_FSL_CMD_HDR_SIZE = 16, /* 4 DWORDS */ -- cgit v0.10.2 From d542483876f6ed720f573de3fbb1d8e60ccd0d6e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 16 Nov 2015 16:57:08 +0200 Subject: vhost: relax log address alignment commit 5d9a07b0de512b77bf28d2401e5fe3351f00a240 ("vhost: relax used address alignment") fixed the alignment for the used virtual address, but not for the physical address used for logging. That's a mistake: alignment should clearly be the same for virtual and physical addresses, Cc: stable@vger.kernel.org Signed-off-by: Michael S. Tsirkin diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index eec2f11..080422f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -819,7 +819,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE); if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) || (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) || - (a.log_guest_addr & (sizeof(u64) - 1))) { + (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))) { r = -EINVAL; break; } -- cgit v0.10.2 From c13f99b7e945dad5273a8b7ee230f4d1f22d3354 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 16 Sep 2015 19:29:17 -0500 Subject: virtio: fix memory leak of virtio ida cache layers The virtio core uses a static ida named virtio_index_ida for assigning index numbers to virtio devices during registration. The ida core may allocate some internal idr cache layers and an ida bitmap upon any ida allocation, and all these layers are truely freed only upon the ida destruction. The virtio_index_ida is not destroyed at present, leading to a memory leak when using the virtio core as a module and atleast one virtio device is registered and unregistered. Fix this by invoking ida_destroy() in the virtio core module exit. Cc: stable@vger.kernel.org Signed-off-by: Suman Anna Signed-off-by: Michael S. Tsirkin diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index b1877d7..7062bb0 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -412,6 +412,7 @@ static int virtio_init(void) static void __exit virtio_exit(void) { bus_unregister(&virtio_bus); + ida_destroy(&virtio_index_ida); } core_initcall(virtio_init); module_exit(virtio_exit); -- cgit v0.10.2 From 40c172e5d99220c8a99dbc4468e792c74494a596 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 29 Nov 2015 12:43:48 +0200 Subject: tools/virtio: move list macro stubs Makes them more generally available. Signed-off-by: Michael S. Tsirkin diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h index 0a3da64..4db7d56 100644 --- a/tools/virtio/linux/kernel.h +++ b/tools/virtio/linux/kernel.h @@ -110,4 +110,10 @@ static inline void free_page(unsigned long addr) (void) (&_min1 == &_min2); \ _min1 < _min2 ? _min1 : _min2; }) +/* TODO: empty stubs for now. Broken but enough for virtio_ring.c */ +#define list_add_tail(a, b) do {} while (0) +#define list_del(a) do {} while (0) +#define list_for_each_entry(a, b, c) while (0) +/* end of stubs */ + #endif /* KERNEL_H */ diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index a3e0701..ee125e7 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -3,12 +3,6 @@ #include #include -/* TODO: empty stubs for now. Broken but enough for virtio_ring.c */ -#define list_add_tail(a, b) do {} while (0) -#define list_del(a) do {} while (0) -#define list_for_each_entry(a, b, c) while (0) -/* end of stubs */ - struct virtio_device { void *dev; u64 features; -- cgit v0.10.2 From 55564a02b2ad4dab0c29fd9391d5118592c7fdc8 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 29 Nov 2015 13:03:23 +0200 Subject: tools/virtio: fix byteswap logic commit cf561f0d2eb74574ad9985a2feab134267a9d298 ("virtio: introduce virtio_is_little_endian() helper") changed byteswap logic to skip feature bit checks for LE platforms, but didn't update tools/virtio, so vring_bench started failing. Update the copy under tools/virtio/ (TODO: find a way to avoid this code duplication). Cc: Greg Kurz Signed-off-by: Michael S. Tsirkin diff --git a/tools/virtio/linux/virtio_config.h b/tools/virtio/linux/virtio_config.h index 806d683..57a6964 100644 --- a/tools/virtio/linux/virtio_config.h +++ b/tools/virtio/linux/virtio_config.h @@ -40,33 +40,39 @@ static inline void __virtio_clear_bit(struct virtio_device *vdev, #define virtio_has_feature(dev, feature) \ (__virtio_test_bit((dev), feature)) +static inline bool virtio_is_little_endian(struct virtio_device *vdev) +{ + return virtio_has_feature(vdev, VIRTIO_F_VERSION_1) || + virtio_legacy_is_little_endian(); +} + +/* Memory accessors */ static inline u16 virtio16_to_cpu(struct virtio_device *vdev, __virtio16 val) { - return __virtio16_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); + return __virtio16_to_cpu(virtio_is_little_endian(vdev), val); } static inline __virtio16 cpu_to_virtio16(struct virtio_device *vdev, u16 val) { - return __cpu_to_virtio16(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); + return __cpu_to_virtio16(virtio_is_little_endian(vdev), val); } static inline u32 virtio32_to_cpu(struct virtio_device *vdev, __virtio32 val) { - return __virtio32_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); + return __virtio32_to_cpu(virtio_is_little_endian(vdev), val); } static inline __virtio32 cpu_to_virtio32(struct virtio_device *vdev, u32 val) { - return __cpu_to_virtio32(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); + return __cpu_to_virtio32(virtio_is_little_endian(vdev), val); } static inline u64 virtio64_to_cpu(struct virtio_device *vdev, __virtio64 val) { - return __virtio64_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); + return __virtio64_to_cpu(virtio_is_little_endian(vdev), val); } static inline __virtio64 cpu_to_virtio64(struct virtio_device *vdev, u64 val) { - return __cpu_to_virtio64(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); + return __cpu_to_virtio64(virtio_is_little_endian(vdev), val); } - -- cgit v0.10.2 From 5fba13b5cf5856e725de35665c37b647323d3b9a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 29 Nov 2015 13:34:44 +0200 Subject: vhost: replace % with & on data path We know vring num is a power of 2, so use & to mask the high bits. Signed-off-by: Michael S. Tsirkin diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 080422f..ad2146a 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1369,7 +1369,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ if (unlikely(__get_user(ring_head, - &vq->avail->ring[last_avail_idx % vq->num]))) { + &vq->avail->ring[last_avail_idx & (vq->num - 1)]))) { vq_err(vq, "Failed to read head: idx %d address %p\n", last_avail_idx, &vq->avail->ring[last_avail_idx % vq->num]); @@ -1489,7 +1489,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq, u16 old, new; int start; - start = vq->last_used_idx % vq->num; + start = vq->last_used_idx & (vq->num - 1); used = vq->used->ring + start; if (count == 1) { if (__put_user(heads[0].id, &used->id)) { @@ -1531,7 +1531,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, { int start, n, r; - start = vq->last_used_idx % vq->num; + start = vq->last_used_idx & (vq->num - 1); n = vq->num - start; if (n < count) { r = __vhost_add_used_n(vq, heads, n); -- cgit v0.10.2 From 82107539bbb9db303fb6676c78c836add5680bb0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 1 Dec 2015 15:32:49 +0100 Subject: virtio: Do not drop __GFP_HIGH in alloc_indirect b92b1b89a33c ("virtio: force vring descriptors to be allocated from lowmem") tried to exclude highmem pages for descriptors so it cleared __GFP_HIGHMEM from a given gfp mask. The patch also cleared __GFP_HIGH which doesn't make much sense for this fix because __GFP_HIGH only controls access to memory reserves and it doesn't have any influence on the zone selection. Some of the call paths use GFP_ATOMIC and dropping __GFP_HIGH will reduce their changes for success because the lack of access to memory reserves. Signed-off-by: Michal Hocko Signed-off-by: Michael S. Tsirkin Acked-by: Will Deacon Reviewed-by: Mel Gorman diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 096b857..abdb341 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -109,7 +109,7 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq, * otherwise virt_to_phys will give us bogus addresses in the * virtqueue. */ - gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH); + gfp &= ~__GFP_HIGHMEM; desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp); if (!desc) -- cgit v0.10.2 From f277ec42f38f406ed073569b24b04f1c291fec0f Mon Sep 17 00:00:00 2001 From: Venkatesh Srinivas Date: Tue, 10 Nov 2015 16:21:07 -0800 Subject: virtio_ring: shadow available ring flags & index Improves cacheline transfer flow of available ring header. Virtqueues are implemented as a pair of rings, one producer->consumer avail ring and one consumer->producer used ring; preceding the avail ring in memory are two contiguous u16 fields -- avail->flags and avail->idx. A producer posts work by writing to avail->idx and a consumer reads avail->idx. The flags and idx fields only need to be written by a producer CPU and only read by a consumer CPU; when the producer and consumer are running on different CPUs and the virtio_ring code is structured to only have source writes/sink reads, we can continuously transfer the avail header cacheline between 'M' states between cores. This flow optimizes core -> core bandwidth on certain CPUs. (see: "Software Optimization Guide for AMD Family 15h Processors", Section 11.6; similar language appears in the 10h guide and should apply to CPUs w/ exclusive caches, using LLC as a transfer cache) Unfortunately the existing virtio_ring code issued reads to the avail->idx and read-modify-writes to avail->flags on the producer. This change shadows the flags and index fields in producer memory; the vring code now reads from the shadows and only ever writes to avail->flags and avail->idx, allowing the cacheline to transfer core -> core optimally. In a concurrent version of vring_bench, the time required for 10,000,000 buffer checkout/returns was reduced by ~2% (average across many runs) on an AMD Piledriver (15h) CPU: (w/o shadowing): Performance counter stats for './vring_bench': 5,451,082,016 L1-dcache-loads ... 2.221477739 seconds time elapsed (w/ shadowing): Performance counter stats for './vring_bench': 5,405,701,361 L1-dcache-loads ... 2.168405376 seconds time elapsed The further away (in a NUMA sense) virtio producers and consumers are from each other, the more we expect to benefit. Physical implementations of virtio devices and implementations of virtio where the consumer polls vring avail indexes (vhost) should also benefit. Signed-off-by: Venkatesh Srinivas Signed-off-by: Michael S. Tsirkin diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index abdb341..ee663c4 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -80,6 +80,12 @@ struct vring_virtqueue { /* Last used index we've seen. */ u16 last_used_idx; + /* Last written value to avail->flags */ + u16 avail_flags_shadow; + + /* Last written value to avail->idx in guest byte order */ + u16 avail_idx_shadow; + /* How to notify other side. FIXME: commonalize hcalls! */ bool (*notify)(struct virtqueue *vq); @@ -235,13 +241,14 @@ static inline int virtqueue_add(struct virtqueue *_vq, /* Put entry in available array (but don't update avail->idx until they * do sync). */ - avail = virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) & (vq->vring.num - 1); + avail = vq->avail_idx_shadow & (vq->vring.num - 1); vq->vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ virtio_wmb(vq->weak_barriers); - vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) + 1); + vq->avail_idx_shadow++; + vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow); vq->num_added++; pr_debug("Added buffer head %i to %p\n", head, vq); @@ -354,8 +361,8 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq) * event. */ virtio_mb(vq->weak_barriers); - old = virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - vq->num_added; - new = virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx); + old = vq->avail_idx_shadow - vq->num_added; + new = vq->avail_idx_shadow; vq->num_added = 0; #ifdef DEBUG @@ -510,7 +517,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) /* If we expect an interrupt for the next entry, tell host * by writing event index and flush out the write before * the read in the next get_buf call. */ - if (!(vq->vring.avail->flags & cpu_to_virtio16(_vq->vdev, VRING_AVAIL_F_NO_INTERRUPT))) { + if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, vq->last_used_idx); virtio_mb(vq->weak_barriers); } @@ -537,7 +544,11 @@ void virtqueue_disable_cb(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - vq->vring.avail->flags |= cpu_to_virtio16(_vq->vdev, VRING_AVAIL_F_NO_INTERRUPT); + if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { + vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; + vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow); + } + } EXPORT_SYMBOL_GPL(virtqueue_disable_cb); @@ -565,7 +576,10 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq) /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to * either clear the flags bit or point the event index at the next * entry. Always do both to keep code simple. */ - vq->vring.avail->flags &= cpu_to_virtio16(_vq->vdev, ~VRING_AVAIL_F_NO_INTERRUPT); + if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) { + vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; + vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow); + } vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx); END_USE(vq); return last_used_idx; @@ -633,9 +647,12 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to * either clear the flags bit or point the event index at the next * entry. Always do both to keep code simple. */ - vq->vring.avail->flags &= cpu_to_virtio16(_vq->vdev, ~VRING_AVAIL_F_NO_INTERRUPT); + if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) { + vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; + vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow); + } /* TODO: tune this threshold */ - bufs = (u16)(virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - vq->last_used_idx) * 3 / 4; + bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4; vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs); virtio_mb(vq->weak_barriers); if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) { @@ -670,7 +687,8 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq) /* detach_buf clears data, so grab it now. */ buf = vq->data[i]; detach_buf(vq, i); - vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - 1); + vq->avail_idx_shadow--; + vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow); END_USE(vq); return buf; } @@ -735,6 +753,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, vq->weak_barriers = weak_barriers; vq->broken = false; vq->last_used_idx = 0; + vq->avail_flags_shadow = 0; + vq->avail_idx_shadow = 0; vq->num_added = 0; list_add_tail(&vq->vq.list, &vdev->vqs); #ifdef DEBUG @@ -746,8 +766,10 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); /* No callback? Tell other side not to bother us. */ - if (!callback) - vq->vring.avail->flags |= cpu_to_virtio16(vdev, VRING_AVAIL_F_NO_INTERRUPT); + if (!callback) { + vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; + vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow); + } /* Put everything in free lists. */ vq->free_head = 0; -- cgit v0.10.2 From f27a62995330fdc5f804acf2ebf1444f9025c879 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Sun, 6 Dec 2015 11:25:43 +0100 Subject: lightnvm: use flags in rrpc_get_blk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rrpc_get_blk use constant 0 as the input parameter of nvm_get_blk, this may result in getting gc block failed unexpectedly. Signed-off-by: Wenwei Tao Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 75e59c3..d606c7a 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -182,7 +182,7 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, struct nvm_block *blk; struct rrpc_block *rblk; - blk = nvm_get_blk(rrpc->dev, rlun->parent, 0); + blk = nvm_get_blk(rrpc->dev, rlun->parent, flags); if (!blk) return NULL; -- cgit v0.10.2 From d3d1a43842077c53e22dedaebfe7cb52022b18e1 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Sun, 6 Dec 2015 11:25:44 +0100 Subject: lightnvm: put blks when luns configure failed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Put the allocated blocks back to the free list when the luns configure failed, to make these blocks useable to others. Signed-off-by: Wenwei Tao Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index d606c7a..cf1a4a5 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -202,6 +202,20 @@ static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk) nvm_put_blk(rrpc->dev, rblk->parent); } +static void rrpc_put_blks(struct rrpc *rrpc) +{ + struct rrpc_lun *rlun; + int i; + + for (i = 0; i < rrpc->nr_luns; i++) { + rlun = &rrpc->luns[i]; + if (rlun->cur) + rrpc_put_blk(rrpc, rlun->cur); + if (rlun->gc_cur) + rrpc_put_blk(rrpc, rlun->gc_cur); + } +} + static struct rrpc_lun *get_next_lun(struct rrpc *rrpc) { int next = atomic_inc_return(&rrpc->next_lun); @@ -1224,18 +1238,21 @@ static int rrpc_luns_configure(struct rrpc *rrpc) rblk = rrpc_get_blk(rrpc, rlun, 0); if (!rblk) - return -EINVAL; + goto err; rrpc_set_lun_cur(rlun, rblk); /* Emergency gc block */ rblk = rrpc_get_blk(rrpc, rlun, 1); if (!rblk) - return -EINVAL; + goto err; rlun->gc_cur = rblk; } return 0; +err: + rrpc_put_blks(rrpc); + return -EINVAL; } static struct nvm_tgt_type tt_rrpc; -- cgit v0.10.2 From e9b76a80f1f165cb031f62128dc07d219e29f5a3 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Sun, 6 Dec 2015 11:25:45 +0100 Subject: lightnvm: refactor spin_unlock in gennvm_get_blk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The spin_unlock is duplicated multiple times. Jump to a single unlock to improve the code flow. Signed-off-by: Wenwei Tao Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 35dde84..ce60254 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -262,14 +262,11 @@ static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, if (list_empty(&lun->free_list)) { pr_err_ratelimited("gennvm: lun %u have no free pages available", lun->vlun.id); - spin_unlock(&vlun->lock); goto out; } - while (!is_gc && lun->vlun.nr_free_blocks < lun->reserved_blocks) { - spin_unlock(&vlun->lock); + if (!is_gc && lun->vlun.nr_free_blocks < lun->reserved_blocks) goto out; - } blk = list_first_entry(&lun->free_list, struct nvm_block, list); list_move_tail(&blk->list, &lun->used_list); @@ -278,8 +275,8 @@ static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, lun->vlun.nr_free_blocks--; lun->vlun.nr_inuse_blocks++; - spin_unlock(&vlun->lock); out: + spin_unlock(&vlun->lock); return blk; } -- cgit v0.10.2 From 437f8f9a1e923deab6c61822c36f29f4a2198cfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Sun, 6 Dec 2015 11:25:46 +0100 Subject: lightnvm: check mm before use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The core can may issue I/Os before a media manager is registered with the lightnvm subsystem. Make sure that we don't call the media manager ->end_io prematurely with a null pointer. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 06c3364..762c9a7 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -455,7 +455,7 @@ static void nvme_nvm_end_io(struct request *rq, int error) struct nvm_rq *rqd = rq->end_io_data; struct nvm_dev *dev = rqd->dev; - if (dev->mt->end_io(rqd, error)) + if (dev->mt && dev->mt->end_io(rqd, error)) pr_err("nvme: err status: %x result: %lx\n", rq->errors, (unsigned long)rq->special); -- cgit v0.10.2 From 57b4bd06ff0372fe1e3617889c4b37fbd500364a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Sun, 6 Dec 2015 11:25:47 +0100 Subject: lightnvm: comments on constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is not obvious what NVM_IO_* and NVM_BLK_T_* are used for. Make sure to comment them appropriately as the other constants. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index c6916ae..935ef38 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -50,9 +50,16 @@ enum { NVM_IO_DUAL_ACCESS = 0x1, NVM_IO_QUAD_ACCESS = 0x2, + /* NAND Access Modes */ NVM_IO_SUSPEND = 0x80, NVM_IO_SLC_MODE = 0x100, NVM_IO_SCRAMBLE_DISABLE = 0x200, + + /* Block Types */ + NVM_BLK_T_FREE = 0x0, + NVM_BLK_T_BAD = 0x1, + NVM_BLK_T_DEV = 0x2, + NVM_BLK_T_HOST = 0x4, }; struct nvm_id_group { -- cgit v0.10.2 From 16f26c3aa9b9c36a9d1092ae3258461d1008481e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Sun, 6 Dec 2015 11:25:48 +0100 Subject: lightnvm: replace req queue with nvmdev for lld MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the case where a request queue is passed to the low lever lightnvm device drive integration, the device driver might pass its admin commands through another queue. Instead pass nvm_dev, and let the low level drive the appropriate queue. Reported-by: Christoph Hellwig Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 0c3940e..7981b74 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -444,8 +444,9 @@ static void null_lnvm_end_io(struct request *rq, int error) blk_put_request(rq); } -static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) +static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { + struct request_queue *q = dev->q; struct request *rq; struct bio *bio = rqd->bio; @@ -470,7 +471,7 @@ static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) return 0; } -static int null_lnvm_id(struct request_queue *q, struct nvm_id *id) +static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) { sector_t size = gb * 1024 * 1024 * 1024ULL; sector_t blksize; @@ -523,7 +524,7 @@ static int null_lnvm_id(struct request_queue *q, struct nvm_id *id) return 0; } -static void *null_lnvm_create_dma_pool(struct request_queue *q, char *name) +static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name) { mempool_t *virtmem_pool; @@ -541,7 +542,7 @@ static void null_lnvm_destroy_dma_pool(void *pool) mempool_destroy(pool); } -static void *null_lnvm_dev_dma_alloc(struct request_queue *q, void *pool, +static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, gfp_t mem_flags, dma_addr_t *dma_handler) { return mempool_alloc(pool, mem_flags); diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 86ce887..4a8d1fe 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -74,7 +74,7 @@ EXPORT_SYMBOL(nvm_unregister_target); void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags, dma_addr_t *dma_handler) { - return dev->ops->dev_dma_alloc(dev->q, dev->ppalist_pool, mem_flags, + return dev->ops->dev_dma_alloc(dev, dev->ppalist_pool, mem_flags, dma_handler); } EXPORT_SYMBOL(nvm_dev_dma_alloc); @@ -246,7 +246,7 @@ static int nvm_init(struct nvm_dev *dev) if (!dev->q || !dev->ops) return ret; - if (dev->ops->identity(dev->q, &dev->identity)) { + if (dev->ops->identity(dev, &dev->identity)) { pr_err("nvm: device could not be identified\n"); goto err; } @@ -326,8 +326,7 @@ int nvm_register(struct request_queue *q, char *disk_name, } if (dev->ops->max_phys_sect > 1) { - dev->ppalist_pool = dev->ops->create_dma_pool(dev->q, - "ppalist"); + dev->ppalist_pool = dev->ops->create_dma_pool(dev, "ppalist"); if (!dev->ppalist_pool) { pr_err("nvm: could not create ppa pool\n"); ret = -ENOMEM; diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index ce60254..52b513a 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -195,7 +195,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) } if (dev->ops->get_l2p_tbl) { - ret = dev->ops->get_l2p_tbl(dev->q, 0, dev->total_pages, + ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_pages, gennvm_block_map, dev); if (ret) { pr_err("gennvm: could not read L2P table.\n"); @@ -346,7 +346,7 @@ static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) gennvm_generic_to_addr_mode(dev, rqd); rqd->dev = dev; - return dev->ops->submit_io(dev->q, rqd); + return dev->ops->submit_io(dev, rqd); } static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa, @@ -382,7 +382,7 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) if (!dev->ops->set_bb_tbl) return; - if (dev->ops->set_bb_tbl(dev->q, rqd, 1)) + if (dev->ops->set_bb_tbl(dev, rqd, 1)) return; gennvm_addr_to_generic_mode(dev, rqd); @@ -450,7 +450,7 @@ static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, gennvm_generic_to_addr_mode(dev, &rqd); - ret = dev->ops->erase_block(dev->q, &rqd); + ret = dev->ops->erase_block(dev, &rqd); if (plane_cnt) nvm_dev_dma_free(dev, rqd.ppa_list, rqd.dma_ppa_list); diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index cf1a4a5..134e4fa 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -1016,7 +1016,7 @@ static int rrpc_map_init(struct rrpc *rrpc) return 0; /* Bring up the mapping table from device */ - ret = dev->ops->get_l2p_tbl(dev->q, 0, dev->total_pages, + ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_pages, rrpc_l2p_update, rrpc); if (ret) { pr_err("nvm: rrpc: could not read L2P table.\n"); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 762c9a7..15f2acb 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -271,9 +271,9 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) return 0; } -static int nvme_nvm_identity(struct request_queue *q, struct nvm_id *nvm_id) +static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_id *nvme_nvm_id; struct nvme_nvm_command c = {}; @@ -308,10 +308,10 @@ out: return ret; } -static int nvme_nvm_get_l2p_tbl(struct request_queue *q, u64 slba, u32 nlb, +static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, nvm_l2p_update_fn *update_l2p, void *priv) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; u32 len = queue_max_hw_sectors(dev->admin_q) << 9; @@ -415,10 +415,10 @@ out: return ret; } -static int nvme_nvm_set_bb_tbl(struct request_queue *q, struct nvm_rq *rqd, +static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, int type) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; int ret = 0; @@ -463,8 +463,9 @@ static void nvme_nvm_end_io(struct request *rq, int error) blk_mq_free_request(rq); } -static int nvme_nvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { + struct request_queue *q = dev->q; struct nvme_ns *ns = q->queuedata; struct request *rq; struct bio *bio = rqd->bio; @@ -502,8 +503,9 @@ static int nvme_nvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) return 0; } -static int nvme_nvm_erase_block(struct request_queue *q, struct nvm_rq *rqd) +static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd) { + struct request_queue *q = dev->q; struct nvme_ns *ns = q->queuedata; struct nvme_nvm_command c = {}; @@ -515,9 +517,9 @@ static int nvme_nvm_erase_block(struct request_queue *q, struct nvm_rq *rqd) return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0); } -static void *nvme_nvm_create_dma_pool(struct request_queue *q, char *name) +static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; return dma_pool_create(name, dev->dev, PAGE_SIZE, PAGE_SIZE, 0); @@ -530,7 +532,7 @@ static void nvme_nvm_destroy_dma_pool(void *pool) dma_pool_destroy(dma_pool); } -static void *nvme_nvm_dev_dma_alloc(struct request_queue *q, void *pool, +static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, gfp_t mem_flags, dma_addr_t *dma_handler) { return dma_pool_alloc(pool, mem_flags, dma_handler); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 935ef38..034117b 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -183,17 +183,17 @@ struct nvm_block; typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); -typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *); -typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32, +typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); +typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, nvm_l2p_update_fn *, void *); typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, nvm_bb_update_fn *, void *); -typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int); -typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *); -typedef int (nvm_erase_blk_fn)(struct request_queue *, struct nvm_rq *); -typedef void *(nvm_create_dma_pool_fn)(struct request_queue *, char *); +typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); +typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); +typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); typedef void (nvm_destroy_dma_pool_fn)(void *); -typedef void *(nvm_dev_dma_alloc_fn)(struct request_queue *, void *, gfp_t, +typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, dma_addr_t *); typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); -- cgit v0.10.2 From 762796bc9e5758df6c72253e4ec19657055cea1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Sun, 6 Dec 2015 11:25:49 +0100 Subject: lightnvm: fix media mgr registration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes two issues during media manager registration. 1. The ppa pool can be used at media manager registration. Allocate the ppa pool before that. 2. If a media manager can't be found, this should not lead to the device being unallocated. A media manager can be registered later, that can manage the device. Only warn if a media manager fails initialization. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 4a8d1fe..8f41b24 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -97,15 +97,47 @@ static struct nvmm_type *nvm_find_mgr_type(const char *name) return NULL; } +struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev) +{ + struct nvmm_type *mt; + int ret; + + lockdep_assert_held(&nvm_lock); + + list_for_each_entry(mt, &nvm_mgrs, list) { + ret = mt->register_mgr(dev); + if (ret < 0) { + pr_err("nvm: media mgr failed to init (%d) on dev %s\n", + ret, dev->name); + return NULL; /* initialization failed */ + } else if (ret > 0) + return mt; + } + + return NULL; +} + int nvm_register_mgr(struct nvmm_type *mt) { + struct nvm_dev *dev; int ret = 0; down_write(&nvm_lock); - if (nvm_find_mgr_type(mt->name)) + if (nvm_find_mgr_type(mt->name)) { ret = -EEXIST; - else + goto finish; + } else { list_add(&mt->list, &nvm_mgrs); + } + + /* try to register media mgr if any device have none configured */ + list_for_each_entry(dev, &nvm_devices, devices) { + if (dev->mt) + continue; + + dev->mt = nvm_init_mgr(dev); + } +finish: up_write(&nvm_lock); return ret; @@ -123,26 +155,6 @@ void nvm_unregister_mgr(struct nvmm_type *mt) } EXPORT_SYMBOL(nvm_unregister_mgr); -/* register with device with a supported manager */ -static int register_mgr(struct nvm_dev *dev) -{ - struct nvmm_type *mt; - int ret = 0; - - list_for_each_entry(mt, &nvm_mgrs, list) { - ret = mt->register_mgr(dev); - if (ret > 0) { - dev->mt = mt; - break; /* successfully initialized */ - } - } - - if (!ret) - pr_info("nvm: no compatible nvm manager found.\n"); - - return ret; -} - static struct nvm_dev *nvm_find_nvm_dev(const char *name) { struct nvm_dev *dev; @@ -271,14 +283,6 @@ static int nvm_init(struct nvm_dev *dev) goto err; } - down_write(&nvm_lock); - ret = register_mgr(dev); - up_write(&nvm_lock); - if (ret < 0) - goto err; - if (!ret) - return 0; - pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n", dev->name, dev->sec_per_pg, dev->nr_planes, dev->pgs_per_blk, dev->blks_per_lun, dev->nr_luns, @@ -334,7 +338,9 @@ int nvm_register(struct request_queue *q, char *disk_name, } } + /* register device with a supported media manager */ down_write(&nvm_lock); + dev->mt = nvm_init_mgr(dev); list_add(&dev->devices, &nvm_devices); up_write(&nvm_lock); @@ -379,19 +385,13 @@ static int nvm_create_target(struct nvm_dev *dev, struct nvm_tgt_type *tt; struct nvm_target *t; void *targetdata; - int ret = 0; - down_write(&nvm_lock); if (!dev->mt) { - ret = register_mgr(dev); - if (!ret) - ret = -ENODEV; - if (ret < 0) { - up_write(&nvm_lock); - return ret; - } + pr_info("nvm: device has no media manager registered.\n"); + return -ENODEV; } + down_write(&nvm_lock); tt = nvm_find_target_type(create->tgttype); if (!tt) { pr_err("nvm: target type %s not found\n", create->tgttype); -- cgit v0.10.2 From 008b74438269b368d57a14452afbc194f7f9ff2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Sun, 6 Dec 2015 11:25:50 +0100 Subject: lightnvm: prevent gennvm module unload on use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the gennvm module has been initialized. It might be attached to one or several devices. In that case, the module is in use. Make sure that it can not be unloaded. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 52b513a..f434e89 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -219,6 +219,9 @@ static int gennvm_register(struct nvm_dev *dev) struct gen_nvm *gn; int ret; + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + gn = kzalloc(sizeof(struct gen_nvm), GFP_KERNEL); if (!gn) return -ENOMEM; @@ -242,12 +245,14 @@ static int gennvm_register(struct nvm_dev *dev) return 1; err: gennvm_free(dev); + module_put(THIS_MODULE); return ret; } static void gennvm_unregister(struct nvm_dev *dev) { gennvm_free(dev); + module_put(THIS_MODULE); } static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, -- cgit v0.10.2 From 4158624454db3756c1f1b3641f1ab4215e697936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Sun, 6 Dec 2015 11:25:51 +0100 Subject: lightnvm: do not compile in debugging by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LightNVM module exposes a debug interface when CONFIG_NVM_DEBUG is set. This interfaces takes a string to configure media managers and targets. Make sure this interface is only exposed when chosen deliberately. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index a16bf56..85a3390 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -18,6 +18,7 @@ if NVM config NVM_DEBUG bool "Open-Channel SSD debugging support" + default n ---help--- Exposes a debug management interface to create/remove targets at: -- cgit v0.10.2 From 98fb754831e1d139f1151e968bdebb6eefb9b581 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 1 Dec 2015 13:20:40 +0100 Subject: arm64: update linker script to increased L1_CACHE_BYTES value Bring the linker script in line with the recent increase of L1_CACHE_BYTES to 128. Replace the hardcoded value of 64 with the symbolic constant. Signed-off-by: Ard Biesheuvel Acked-by: Mark Rutland [catalin.marinas@arm.com: fix up RW_DATA_SECTION as well] Signed-off-by: Catalin Marinas diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 1ee2c39..71426a7 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -140,7 +141,7 @@ SECTIONS ARM_EXIT_KEEP(EXIT_DATA) } - PERCPU_SECTION(64) + PERCPU_SECTION(L1_CACHE_BYTES) . = ALIGN(PAGE_SIZE); __init_end = .; @@ -158,7 +159,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); _data = .; _sdata = .; - RW_DATA_SECTION(64, PAGE_SIZE, THREAD_SIZE) + RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) PECOFF_EDATA_PADDING _edata = .; -- cgit v0.10.2 From 756b9b37cfb2e3dc76b2e43a8c097402ac736e07 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 7 Dec 2015 12:52:23 -0800 Subject: SUNRPC: Fix callback channel The NFSv4.1 callback channel is currently broken because the receive message will keep shrinking because the backchannel receive buffer size never gets reset. The easiest solution to this problem is instead of changing the receive buffer, to rather adjust the copied request. Fixes: 38b7631fbe42 ("nfs4: limit callback decoding to received bytes") Cc: Benjamin Coddington Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index beac58b..646cdac 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -78,8 +78,7 @@ static __be32 *read_buf(struct xdr_stream *xdr, int nbytes) p = xdr_inline_decode(xdr, nbytes); if (unlikely(p == NULL)) - printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed " - "or truncated request.\n"); + printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n"); return p; } @@ -890,7 +889,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r struct cb_compound_hdr_arg hdr_arg = { 0 }; struct cb_compound_hdr_res hdr_res = { NULL }; struct xdr_stream xdr_in, xdr_out; - struct xdr_buf *rq_arg = &rqstp->rq_arg; __be32 *p, status; struct cb_process_state cps = { .drc_status = 0, @@ -902,8 +900,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r dprintk("%s: start\n", __func__); - rq_arg->len = rq_arg->head[0].iov_len + rq_arg->page_len; - xdr_init_decode(&xdr_in, rq_arg, rq_arg->head[0].iov_base); + xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); xdr_init_encode(&xdr_out, &rqstp->rq_res, p); diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 95f82d8..229956b 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -353,20 +353,12 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) { struct rpc_xprt *xprt = req->rq_xprt; struct svc_serv *bc_serv = xprt->bc_serv; - struct xdr_buf *rq_rcv_buf = &req->rq_rcv_buf; spin_lock(&xprt->bc_pa_lock); list_del(&req->rq_bc_pa_list); xprt_dec_alloc_count(xprt, 1); spin_unlock(&xprt->bc_pa_lock); - if (copied <= rq_rcv_buf->head[0].iov_len) { - rq_rcv_buf->head[0].iov_len = copied; - rq_rcv_buf->page_len = 0; - } else { - rq_rcv_buf->page_len = copied - rq_rcv_buf->head[0].iov_len; - } - req->rq_private_buf.len = copied; set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 7fccf96..cc98528 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1363,7 +1363,19 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen); memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg)); memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res)); + + /* Adjust the argument buffer length */ rqstp->rq_arg.len = req->rq_private_buf.len; + if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len; + rqstp->rq_arg.page_len = 0; + } else if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len) + rqstp->rq_arg.page_len = rqstp->rq_arg.len - + rqstp->rq_arg.head[0].iov_len; + else + rqstp->rq_arg.len = rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len; /* reset result send buffer "put" position */ resv->iov_len = 0; -- cgit v0.10.2 From 785f742223c03ec70f9dcf36b87f59e9df2067e0 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 30 Nov 2015 09:34:26 -0500 Subject: IB/qib: Fix qib_mr structure struct qib_mr requires the mr member be the last because struct qib_mregion contains a dynamic array at the end. The additions of members should have been placed before this structure as the comment noted. Failure to do so was causing random memory corruption. Reproducing this bug was easy to do by running the client and server of ib_write_bw -s 8 -n 5 on the same node. This BUG() was tripped in a slab debug kernel: kernel BUG at mm/slab.c:2572! Fixes: 38071a461f0a ("IB/qib: Support the new memory registration API") Reviewed-by: Mike Marciniszyn Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 2baf5ad..bc803f3 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -329,9 +329,9 @@ struct qib_sge { struct qib_mr { struct ib_mr ibmr; struct ib_umem *umem; - struct qib_mregion mr; /* must be last */ u64 *pages; u32 npages; + struct qib_mregion mr; /* must be last */ }; /* -- cgit v0.10.2 From 1d784b890c0101db2556b981e864c2282a3c1b02 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 1 Dec 2015 10:13:51 -0500 Subject: IB/core: Fix user mode post wr corruption Commit e622f2f4ad21 ("IB: split struct ib_send_wr") introduced a regression for HCAs whose user mode post sends go through ib_uverbs_post_send(). The code didn't account for the fact that the first sge is offset by an operation dependent length. The allocation did, but the pointer to the destination sge list is computed without that knowledge. The sge list copy_from_user() then corrupts fields in the work request Store the operation dependent length in a local variable and compute the sge list copy_from_user() destination using that length. Reviewed-by: Ira Weiny Signed-off-by: Mike Marciniszyn Reviewed-by: Christoph Hellwig Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 94816ae..4cb8e9d 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2446,6 +2446,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, int i, sg_ind; int is_ud; ssize_t ret = -EINVAL; + size_t next_size; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -2490,7 +2491,8 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, goto out_put; } - ud = alloc_wr(sizeof(*ud), user_wr->num_sge); + next_size = sizeof(*ud); + ud = alloc_wr(next_size, user_wr->num_sge); if (!ud) { ret = -ENOMEM; goto out_put; @@ -2511,7 +2513,8 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, user_wr->opcode == IB_WR_RDMA_READ) { struct ib_rdma_wr *rdma; - rdma = alloc_wr(sizeof(*rdma), user_wr->num_sge); + next_size = sizeof(*rdma); + rdma = alloc_wr(next_size, user_wr->num_sge); if (!rdma) { ret = -ENOMEM; goto out_put; @@ -2525,7 +2528,8 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, user_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { struct ib_atomic_wr *atomic; - atomic = alloc_wr(sizeof(*atomic), user_wr->num_sge); + next_size = sizeof(*atomic); + atomic = alloc_wr(next_size, user_wr->num_sge); if (!atomic) { ret = -ENOMEM; goto out_put; @@ -2540,7 +2544,8 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, } else if (user_wr->opcode == IB_WR_SEND || user_wr->opcode == IB_WR_SEND_WITH_IMM || user_wr->opcode == IB_WR_SEND_WITH_INV) { - next = alloc_wr(sizeof(*next), user_wr->num_sge); + next_size = sizeof(*next); + next = alloc_wr(next_size, user_wr->num_sge); if (!next) { ret = -ENOMEM; goto out_put; @@ -2572,7 +2577,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, if (next->num_sge) { next->sg_list = (void *) next + - ALIGN(sizeof *next, sizeof (struct ib_sge)); + ALIGN(next_size, sizeof(struct ib_sge)); if (copy_from_user(next->sg_list, buf + sizeof cmd + cmd.wr_count * cmd.wqe_size + -- cgit v0.10.2 From 57ab2512138205fe7836332fb4742441e53907ff Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Mon, 2 Nov 2015 12:13:14 -0500 Subject: IB/qib: Minor fixes to qib per SFF 8636 Minor errors found via code inspection during future development. SFF 8636 defines bit position 2 to hold the status indication of QSFP memory paging. The mask used to test for the value was incorrect and is fixed in this patch. Additionally, the dump function had a mismatch between the field being printed out and the field used to source the data which was fixed. Reviewed-by: Mitko Haralanov Reviewed-by: Mike Marciniszyn Reported-by: Easwar Hariharan Signed-off-by: Easwar Hariharan Signed-off-by: Mike Marciniszyn Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/qib/qib_qsfp.c b/drivers/infiniband/hw/qib/qib_qsfp.c index 5e27f76..4c7c3c8 100644 --- a/drivers/infiniband/hw/qib/qib_qsfp.c +++ b/drivers/infiniband/hw/qib/qib_qsfp.c @@ -292,7 +292,7 @@ int qib_refresh_qsfp_cache(struct qib_pportdata *ppd, struct qib_qsfp_cache *cp) qib_dev_porterr(ppd->dd, ppd->port, "QSFP byte0 is 0x%02X, S/B 0x0C/D\n", peek[0]); - if ((peek[2] & 2) == 0) { + if ((peek[2] & 4) == 0) { /* * If cable is paged, rather than "flat memory", we need to * set the page to zero, Even if it already appears to be zero. @@ -538,7 +538,7 @@ int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len) sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n", QSFP_DATE_LEN, cd.date); sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n", - QSFP_LOT_LEN, cd.date); + QSFP_LOT_LEN, cd.lot); while (bidx < QSFP_DEFAULT_HDR_CNT) { int iidx; -- cgit v0.10.2 From d144da8c6f51f48ec39d891ea9dff80169c45f3b Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 2 Nov 2015 12:13:25 -0500 Subject: IB/core: use RCU for uverbs id lookup The current implementation gets a spin_lock, and at any scale with qib and hfi1 post send, the lock contention grows exponentially with the number of QPs. idr_find() is RCU compatibile, so read doesn't need the lock. Change to use rcu_read_lock() and rcu_read_unlock() in __idr_get_uobj(). kfree_rcu() is used to insure a grace period between the idr removal and actual free. Reviewed-by: Ira Weiny Signed-off-by: Mike Marciniszyn Reviewed-By: Jason Gunthorpe Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4cb8e9d..1c02dea 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -62,9 +62,11 @@ static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; * The ib_uobject locking scheme is as follows: * * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it - * needs to be held during all idr operations. When an object is + * needs to be held during all idr write operations. When an object is * looked up, a reference must be taken on the object's kref before - * dropping this lock. + * dropping this lock. For read operations, the rcu_read_lock() + * and rcu_write_lock() but similarly the kref reference is grabbed + * before the rcu_read_unlock(). * * - Each object also has an rwsem. This rwsem must be held for * reading while an operation that uses the object is performed. @@ -96,7 +98,7 @@ static void init_uobj(struct ib_uobject *uobj, u64 user_handle, static void release_uobj(struct kref *kref) { - kfree(container_of(kref, struct ib_uobject, ref)); + kfree_rcu(container_of(kref, struct ib_uobject, ref), rcu); } static void put_uobj(struct ib_uobject *uobj) @@ -145,7 +147,7 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, { struct ib_uobject *uobj; - spin_lock(&ib_uverbs_idr_lock); + rcu_read_lock(); uobj = idr_find(idr, id); if (uobj) { if (uobj->context == context) @@ -153,7 +155,7 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, else uobj = NULL; } - spin_unlock(&ib_uverbs_idr_lock); + rcu_read_unlock(); return uobj; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9a68a19..120da1d 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1271,6 +1271,7 @@ struct ib_uobject { int id; /* index into kernel idr */ struct kref ref; struct rw_semaphore mutex; /* protects .live */ + struct rcu_head rcu; /* kfree_rcu() overhead */ int live; }; -- cgit v0.10.2 From 2c63d1072ad7cf1059333ef5cfa06075bead4a39 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Nov 2015 17:41:36 +0100 Subject: IB/iser: use sector_div instead of do_div do_div is the wrong way to divide a sector_t, as it is less efficient when sector_t is 32-bit wide. With the upcoming do_div optimizations, the kernel starts warning about this: drivers/infiniband/ulp/iser/iser_verbs.c:1296:4: note: in expansion of macro 'do_div' include/asm-generic/div64.h:224:22: warning: passing argument 1 of '__div64_32' from incompatible pointer type This changes the code to use sector_div instead, which always produces optimal code. Signed-off-by: Arnd Bergmann Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index a930702..42f4da6 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -1293,7 +1293,7 @@ u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { sector_t sector_off = mr_status.sig_err.sig_err_offset; - do_div(sector_off, sector_size + 8); + sector_div(sector_off, sector_size + 8); *sector = scsi_get_lba(iser_task->sc) + sector_off; pr_err("PI error found type %d at sector %llx " -- cgit v0.10.2 From 3ebd2fd0d0119a5ac7906bf17be637b527f63d31 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Fri, 30 Oct 2015 08:23:45 -0400 Subject: IB/sa: Put netlink request into the request list before sending It was found by Saurabh Sengar that the netlink code tried to allocate memory with GFP_KERNEL while holding a spinlock. While it is possible to fix the issue by replacing GFP_KERNEL with GFP_ATOMIC, it is better to get rid of the spinlock while sending the packet. However, in order to protect against a race condition that a quick response may be received before the request is put on the request list, we need to put the request on the list first. Signed-off-by: Kaike Wan Reviewed-by: Jason Gunthorpe Reviewed-by: Ira Weiny Reported-by: Saurabh Sengar Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 2aba774..a95a32b 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -512,7 +512,7 @@ static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask) return len; } -static int ib_nl_send_msg(struct ib_sa_query *query) +static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; @@ -526,7 +526,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query) if (len <= 0) return -EMSGSIZE; - skb = nlmsg_new(len, GFP_KERNEL); + skb = nlmsg_new(len, gfp_mask); if (!skb) return -ENOMEM; @@ -544,7 +544,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query) /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL); + ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, gfp_mask); if (!ret) ret = len; else @@ -553,7 +553,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query) return ret; } -static int ib_nl_make_request(struct ib_sa_query *query) +static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) { unsigned long flags; unsigned long delay; @@ -562,25 +562,27 @@ static int ib_nl_make_request(struct ib_sa_query *query) INIT_LIST_HEAD(&query->list); query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq); + /* Put the request on the list first.*/ spin_lock_irqsave(&ib_nl_request_lock, flags); - ret = ib_nl_send_msg(query); - if (ret <= 0) { - ret = -EIO; - goto request_out; - } else { - ret = 0; - } - delay = msecs_to_jiffies(sa_local_svc_timeout_ms); query->timeout = delay + jiffies; list_add_tail(&query->list, &ib_nl_request_list); /* Start the timeout if this is the only request */ if (ib_nl_request_list.next == &query->list) queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); - -request_out: spin_unlock_irqrestore(&ib_nl_request_lock, flags); + ret = ib_nl_send_msg(query, gfp_mask); + if (ret <= 0) { + ret = -EIO; + /* Remove the request */ + spin_lock_irqsave(&ib_nl_request_lock, flags); + list_del(&query->list); + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + } else { + ret = 0; + } + return ret; } @@ -1108,7 +1110,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) { if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) { - if (!ib_nl_make_request(query)) + if (!ib_nl_make_request(query, gfp_mask)) return id; } ib_sa_disable_local_svc(query); -- cgit v0.10.2 From 4d59ad2995e4128c06d889f9e223099b1f19548e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 1 Dec 2015 10:17:32 -0800 Subject: IB/srp: Fix a memory leak If srp_connect_ch() returns a positive value then that is considered by its caller as a connection failure but this does not result in a scsi_host_put() call and additionally causes the srp_create_target() function to return a positive value while it should return a negative value. Avoid all this confusion and additionally fix a memory leak by ensuring that srp_connect_ch() always returns a value that is <= 0. This patch avoids that a rejected login triggers the following memory leak: unreferenced object 0xffff88021b24a220 (size 8): comm "srp_daemon", pid 56421, jiffies 4295006762 (age 4240.750s) hex dump (first 8 bytes): 68 6f 73 74 35 38 00 a5 host58.. backtrace: [] kmemleak_alloc+0x7a/0xc0 [] __kmalloc_track_caller+0xfe/0x160 [] kvasprintf+0x5b/0x90 [] kvasprintf_const+0x8d/0xb0 [] kobject_set_name_vargs+0x3c/0xa0 [] dev_set_name+0x3c/0x40 [] scsi_host_alloc+0x327/0x4b0 [] srp_create_target+0x4e/0x8a0 [ib_srp] [] dev_attr_store+0x1b/0x20 [] sysfs_kf_write+0x4a/0x60 [] kernfs_fop_write+0x14e/0x180 [] __vfs_write+0x2f/0xf0 [] vfs_write+0xa4/0x100 [] SyS_write+0x54/0xc0 [] entry_SYSCALL_64_fastpath+0x12/0x6f Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Cc: Sebastian Parschauer Cc: stable Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 9909022..a074dad 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -994,16 +994,16 @@ static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich) ret = srp_lookup_path(ch); if (ret) - return ret; + goto out; while (1) { init_completion(&ch->done); ret = srp_send_req(ch, multich); if (ret) - return ret; + goto out; ret = wait_for_completion_interruptible(&ch->done); if (ret < 0) - return ret; + goto out; /* * The CM event handling code will set status to @@ -1011,15 +1011,16 @@ static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich) * back, or SRP_DLID_REDIRECT if we get a lid/qp * redirect REJ back. */ - switch (ch->status) { + ret = ch->status; + switch (ret) { case 0: ch->connected = true; - return 0; + goto out; case SRP_PORT_REDIRECT: ret = srp_lookup_path(ch); if (ret) - return ret; + goto out; break; case SRP_DLID_REDIRECT: @@ -1028,13 +1029,16 @@ static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich) case SRP_STALE_CONN: shost_printk(KERN_ERR, target->scsi_host, PFX "giving up on stale connection\n"); - ch->status = -ECONNRESET; - return ch->status; + ret = -ECONNRESET; + goto out; default: - return ch->status; + goto out; } } + +out: + return ret <= 0 ? ret : -ENODEV; } static int srp_inv_rkey(struct srp_rdma_ch *ch, u32 rkey) -- cgit v0.10.2 From 09c0c0bea500a9ad362589990ee316c9b2482f44 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 1 Dec 2015 10:18:03 -0800 Subject: IB/srp: Fix possible send queue overflow When using work request based memory registration (fast_reg) we must reserve SQ entries for registration and invalidation in addition to send operations. Each IO consumes 3 SQ entries (registration, send, invalidation) so we need to allocate 3x larger send-queue instead of 2x. Signed-off-by: Sagi Grimberg CC: Stable Reviewed-by: Christoph Hellwig Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index a074dad..c7a95d2 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -488,7 +488,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) struct ib_qp *qp; struct ib_fmr_pool *fmr_pool = NULL; struct srp_fr_pool *fr_pool = NULL; - const int m = 1 + dev->use_fast_reg; + const int m = dev->use_fast_reg ? 3 : 1; struct ib_cq_init_attr cq_attr = {}; int ret; -- cgit v0.10.2 From fc925518aa04ee9f03c4e0cadd9c59f1ef48bcfa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Dec 2015 10:18:30 -0800 Subject: IB/srp: Initialize dma_length in srp_map_idb Without this sg_dma_len will return 0 on architectures tha have the dma_length field. Fixes: commit f7f7aab1a5c0 ("IB/srp: Convert to new registration API") Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index c7a95d2..72fac20 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1524,6 +1524,9 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, state.sg_nents = 1; sg_set_buf(idb_sg, req->indirect_desc, idb_len); idb_sg->dma_address = req->indirect_dma_addr; /* hack! */ +#ifdef CONFIG_NEED_SG_DMA_LENGTH + idb_sg->dma_length = idb_sg->length; /* hack^2 */ +#endif ret = srp_map_finish_fr(&state, ch); if (ret < 0) return ret; -- cgit v0.10.2 From a745f4f410082fedc17be75e10e4098f300943db Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 1 Dec 2015 10:18:47 -0800 Subject: IB/srp: Fix indirect data buffer rkey endianness Detected by sparse. Fixes: commit 330179f2fa93 ("IB/srp: Register the indirect data buffer descriptor") Signed-off-by: Bart Van Assche Cc: stable # v4.3+ Cc: Sagi Grimberg Cc: Christoph Hellwig Cc: Sebastian Parschauer Reviewed-by: Christoph Hellwig Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 72fac20..fac1423 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1662,7 +1662,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, return ret; req->nmdesc++; } else { - idb_rkey = target->global_mr->rkey; + idb_rkey = cpu_to_be32(target->global_mr->rkey); } indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr); -- cgit v0.10.2 From 57b0be9c0fb0ba3a35683c6ce21db7162d6758c5 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 1 Dec 2015 10:19:38 -0800 Subject: IB/srp: Fix srp_map_sg_fr() After dma_map_sg() has been called the return value of that function must be used as the number of elements in the scatterlist instead of scsi_sg_count(). Fixes: commit f7f7aab1a5c0 ("IB/srp: Convert to new registration API") Reported-by: Christoph Hellwig Signed-off-by: Bart Van Assche Cc: stable # v4.4+ Cc: Sagi Grimberg Cc: Sebastian Parschauer Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index fac1423..3db9a65 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1313,7 +1313,7 @@ reset_state: } static int srp_map_finish_fr(struct srp_map_state *state, - struct srp_rdma_ch *ch) + struct srp_rdma_ch *ch, int sg_nents) { struct srp_target_port *target = ch->target; struct srp_device *dev = target->srp_host->srp_dev; @@ -1328,10 +1328,10 @@ static int srp_map_finish_fr(struct srp_map_state *state, WARN_ON_ONCE(!dev->use_fast_reg); - if (state->sg_nents == 0) + if (sg_nents == 0) return 0; - if (state->sg_nents == 1 && target->global_mr) { + if (sg_nents == 1 && target->global_mr) { srp_map_desc(state, sg_dma_address(state->sg), sg_dma_len(state->sg), target->global_mr->rkey); @@ -1345,8 +1345,7 @@ static int srp_map_finish_fr(struct srp_map_state *state, rkey = ib_inc_rkey(desc->mr->rkey); ib_update_fast_reg_key(desc->mr, rkey); - n = ib_map_mr_sg(desc->mr, state->sg, state->sg_nents, - dev->mr_page_size); + n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, dev->mr_page_size); if (unlikely(n < 0)) return n; @@ -1452,16 +1451,15 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch, state->fr.next = req->fr_list; state->fr.end = req->fr_list + ch->target->cmd_sg_cnt; state->sg = scat; - state->sg_nents = scsi_sg_count(req->scmnd); - while (state->sg_nents) { + while (count) { int i, n; - n = srp_map_finish_fr(state, ch); + n = srp_map_finish_fr(state, ch, count); if (unlikely(n < 0)) return n; - state->sg_nents -= n; + count -= n; for (i = 0; i < n; i++) state->sg = sg_next(state->sg); } @@ -1521,13 +1519,12 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, if (dev->use_fast_reg) { state.sg = idb_sg; - state.sg_nents = 1; sg_set_buf(idb_sg, req->indirect_desc, idb_len); idb_sg->dma_address = req->indirect_dma_addr; /* hack! */ #ifdef CONFIG_NEED_SG_DMA_LENGTH idb_sg->dma_length = idb_sg->length; /* hack^2 */ #endif - ret = srp_map_finish_fr(&state, ch); + ret = srp_map_finish_fr(&state, ch, 1); if (ret < 0) return ret; } else if (dev->use_fmr) { diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 87a2a91..f6af531 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -300,10 +300,7 @@ struct srp_map_state { dma_addr_t base_dma_addr; u32 dma_len; u32 total_len; - union { - unsigned int npages; - int sg_nents; - }; + unsigned int npages; unsigned int nmdesc; unsigned int ndesc; }; -- cgit v0.10.2 From 8f5ba10ed40a9d3ffe84854984227d011a7428bd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 3 Dec 2015 16:04:17 -0800 Subject: IB core: Fix ib_sg_to_pages() On 12/03/2015 01:18 AM, Christoph Hellwig wrote: > The patch looks good to me, but while we touch this area, how about > throwing in a few cosmetic fixes as well? How about the patch below ? In that version of the ib_sg_to_pages() fix these concerns have been addressed and additionally to more bugs have been fixed. ------------ [PATCH] IB core: Fix ib_sg_to_pages() Fix the code for detecting gaps. A gap occurs not only if the second or later scatterlist element is not aligned but also if any scatterlist element other than the last does not end at a page boundary. In the code for coalescing contiguous elements, ensure that mr->length is correct and that last_page_addr is up-to-date. Ensure that this function returns a negative error code instead of zero if the first set_page() call fails. Fixes: commit 4c67e2bfc8b7 ("IB/core: Introduce new fast registration API") Reported-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 043a60e..545906d 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1516,7 +1516,7 @@ EXPORT_SYMBOL(ib_map_mr_sg); * @sg_nents: number of entries in sg * @set_page: driver page assignment function pointer * - * Core service helper for drivers to covert the largest + * Core service helper for drivers to convert the largest * prefix of given sg list to a page vector. The sg list * prefix converted is the prefix that meet the requirements * of ib_map_mr_sg. @@ -1533,7 +1533,7 @@ int ib_sg_to_pages(struct ib_mr *mr, u64 last_end_dma_addr = 0, last_page_addr = 0; unsigned int last_page_off = 0; u64 page_mask = ~((u64)mr->page_size - 1); - int i; + int i, ret; mr->iova = sg_dma_address(&sgl[0]); mr->length = 0; @@ -1544,27 +1544,29 @@ int ib_sg_to_pages(struct ib_mr *mr, u64 end_dma_addr = dma_addr + dma_len; u64 page_addr = dma_addr & page_mask; - if (i && page_addr != dma_addr) { - if (last_end_dma_addr != dma_addr) { - /* gap */ - goto done; - - } else if (last_page_off + dma_len <= mr->page_size) { - /* chunk this fragment with the last */ - mr->length += dma_len; - last_end_dma_addr += dma_len; - last_page_off += dma_len; - continue; - } else { - /* map starting from the next page */ - page_addr = last_page_addr + mr->page_size; - dma_len -= mr->page_size - last_page_off; - } + /* + * For the second and later elements, check whether either the + * end of element i-1 or the start of element i is not aligned + * on a page boundary. + */ + if (i && (last_page_off != 0 || page_addr != dma_addr)) { + /* Stop mapping if there is a gap. */ + if (last_end_dma_addr != dma_addr) + break; + + /* + * Coalesce this element with the last. If it is small + * enough just update mr->length. Otherwise start + * mapping from the next page. + */ + goto next_page; } do { - if (unlikely(set_page(mr, page_addr))) - goto done; + ret = set_page(mr, page_addr); + if (unlikely(ret < 0)) + return i ? : ret; +next_page: page_addr += mr->page_size; } while (page_addr < end_dma_addr); @@ -1574,7 +1576,6 @@ int ib_sg_to_pages(struct ib_mr *mr, last_page_off = end_dma_addr & ~page_mask; } -done: return i; } EXPORT_SYMBOL(ib_sg_to_pages); -- cgit v0.10.2 From 25642e1459ace29f6ce5a171efc8b7b59a52a2d4 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Mon, 7 Dec 2015 11:28:28 +1100 Subject: powerpc/opal-irqchip: Fix double endian conversion The OPAL event calls return a mask of events that are active in big endian format. This is checked when unmasking the events in the irqchip by comparison with a cached value. The cached value was stored in big endian format but should've been converted to CPU endian first. This bug leads to OPAL event delivery being delayed or dropped on some systems. Symptoms may include a non-functional console. The bug is fixed by calling opal_handle_events(...) instead of duplicating code in opal_event_unmask(...). Fixes: 9f0fd0499d30 ("powerpc/powernv: Add a virtual irqchip for opal events") Cc: stable@vger.kernel.org # v4.2+ Reported-by: Douglas L Lehr Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index 6ccfb6c..0a00e2a 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -43,11 +43,34 @@ static unsigned int opal_irq_count; static unsigned int *opal_irqs; static void opal_handle_irq_work(struct irq_work *work); -static __be64 last_outstanding_events; +static u64 last_outstanding_events; static struct irq_work opal_event_irq_work = { .func = opal_handle_irq_work, }; +void opal_handle_events(uint64_t events) +{ + int virq, hwirq = 0; + u64 mask = opal_event_irqchip.mask; + + if (!in_irq() && (events & mask)) { + last_outstanding_events = events; + irq_work_queue(&opal_event_irq_work); + return; + } + + while (events & mask) { + hwirq = fls64(events) - 1; + if (BIT_ULL(hwirq) & mask) { + virq = irq_find_mapping(opal_event_irqchip.domain, + hwirq); + if (virq) + generic_handle_irq(virq); + } + events &= ~BIT_ULL(hwirq); + } +} + static void opal_event_mask(struct irq_data *d) { clear_bit(d->hwirq, &opal_event_irqchip.mask); @@ -55,12 +78,12 @@ static void opal_event_mask(struct irq_data *d) static void opal_event_unmask(struct irq_data *d) { + __be64 events; + set_bit(d->hwirq, &opal_event_irqchip.mask); - opal_poll_events(&last_outstanding_events); - if (last_outstanding_events & opal_event_irqchip.mask) - /* Need to retrigger the interrupt */ - irq_work_queue(&opal_event_irq_work); + opal_poll_events(&events); + opal_handle_events(be64_to_cpu(events)); } static int opal_event_set_type(struct irq_data *d, unsigned int flow_type) @@ -96,29 +119,6 @@ static int opal_event_map(struct irq_domain *d, unsigned int irq, return 0; } -void opal_handle_events(uint64_t events) -{ - int virq, hwirq = 0; - u64 mask = opal_event_irqchip.mask; - - if (!in_irq() && (events & mask)) { - last_outstanding_events = events; - irq_work_queue(&opal_event_irq_work); - return; - } - - while (events & mask) { - hwirq = fls64(events) - 1; - if (BIT_ULL(hwirq) & mask) { - virq = irq_find_mapping(opal_event_irqchip.domain, - hwirq); - if (virq) - generic_handle_irq(virq); - } - events &= ~BIT_ULL(hwirq); - } -} - static irqreturn_t opal_interrupt(int irq, void *data) { __be64 events; @@ -131,7 +131,7 @@ static irqreturn_t opal_interrupt(int irq, void *data) static void opal_handle_irq_work(struct irq_work *work) { - opal_handle_events(be64_to_cpu(last_outstanding_events)); + opal_handle_events(last_outstanding_events); } static int opal_event_match(struct irq_domain *h, struct device_node *node, -- cgit v0.10.2 From e606e035cc7293a3824527d97359711fdda00663 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Mon, 7 Dec 2015 14:34:40 +0100 Subject: cxl: Set endianess of kernel contexts A process element (defined in CAIA) keeps track of the endianess of contexts through the Little Endian (LE) bit of the State Register. It is currently set for user contexts, but was somehow forgotten for kernel contexts, so this patch fixes it. It could lead to erratic behavior from an AFU when the context is attached through the kernel API. Fixes: 2f663527bd6a ("cxl: Configure PSL for kernel contexts and merge code") Cc: stable@vger.kernel.org # 4.2+ Signed-off-by: Frederic Barrat Suggested-by: Michael Neuling Signed-off-by: Michael Ellerman diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index d2e75c8..f409097 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -497,6 +497,7 @@ static u64 calculate_sr(struct cxl_context *ctx) { u64 sr = 0; + set_endian(sr); if (ctx->master) sr |= CXL_PSL_SR_An_MP; if (mfspr(SPRN_LPCR) & LPCR_TC) @@ -506,7 +507,6 @@ static u64 calculate_sr(struct cxl_context *ctx) sr |= CXL_PSL_SR_An_HV; } else { sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R; - set_endian(sr); sr &= ~(CXL_PSL_SR_An_HV); if (!test_tsk_thread_flag(current, TIF_32BIT)) sr |= CXL_PSL_SR_An_SF; -- cgit v0.10.2 From 23adc192b862b69ad80a40bd5206e337f41264ac Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Tue, 8 Dec 2015 12:27:18 +0800 Subject: ALSA: hda - Fixing speaker noise on the two latest thinkpad models We have two latest thinkpad laptop models which are all based on the Intel skylake platforms, and all of them have the codec alc293 on them. When the machines boot to the desktop, an greeting dialogue shows up with the notification sound. But on these two models, there is noise with the notification sound. We have 3 SKUs for each of the models, all of them have this problem. So far, this problem is only specific to these two thinkpad models, we did not find this problem on the old thinkpad models with the codec alc293 or alc292. A workaround for this problem is disabling the aamix. Cc: stable@vger.kernel.org BugLink: https://bugs.launchpad.net/bugs/1523517 Signed-off-by: Hui Wang Signed-off-by: Takashi Iwai diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index ebc5362..49b1d36 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -4596,6 +4596,7 @@ enum { ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, ALC275_FIXUP_DELL_XPS, ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE, + ALC293_FIXUP_LENOVO_SPK_NOISE, }; static const struct hda_fixup alc269_fixups[] = { @@ -5187,6 +5188,12 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC255_FIXUP_DELL1_MIC_NO_PRESENCE }, + [ALC293_FIXUP_LENOVO_SPK_NOISE] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc_fixup_disable_aamix, + .chained = true, + .chain_id = ALC269_FIXUP_THINKPAD_ACPI + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -5334,6 +5341,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x2215, "Thinkpad", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x17aa, 0x2223, "ThinkPad T550", ALC292_FIXUP_TPT440_DOCK), SND_PCI_QUIRK(0x17aa, 0x2226, "ThinkPad X250", ALC292_FIXUP_TPT440_DOCK), + SND_PCI_QUIRK(0x17aa, 0x2233, "Thinkpad", ALC293_FIXUP_LENOVO_SPK_NOISE), SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC), SND_PCI_QUIRK(0x17aa, 0x3978, "IdeaPad Y410P", ALC269_FIXUP_NO_SHUTUP), SND_PCI_QUIRK(0x17aa, 0x5013, "Thinkpad", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), @@ -5343,6 +5351,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x5034, "Thinkpad T450", ALC292_FIXUP_TPT440_DOCK), SND_PCI_QUIRK(0x17aa, 0x5036, "Thinkpad T450s", ALC292_FIXUP_TPT440_DOCK), SND_PCI_QUIRK(0x17aa, 0x503c, "Thinkpad L450", ALC292_FIXUP_TPT440_DOCK), + SND_PCI_QUIRK(0x17aa, 0x504b, "Thinkpad", ALC293_FIXUP_LENOVO_SPK_NOISE), SND_PCI_QUIRK(0x17aa, 0x5109, "Thinkpad", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x17aa, 0x3bf8, "Quanta FL1", ALC269_FIXUP_PCM_44K), SND_PCI_QUIRK(0x17aa, 0x9e54, "LENOVO NB", ALC269_FIXUP_LENOVO_EAPD), -- cgit v0.10.2 From 8fbf9d92a7bc4cadd3a0139698cf17031dfcdfca Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Thu, 26 Nov 2015 19:45:16 +0100 Subject: drm/vmwgfx: Implement the cursor_set2 callback v2 Fixes native drm clients like Fedora 23 Wayland which now appears to be able to use cursor hotspots without strange cursor offsets. Also fixes a couple of ignored error paths. Since the core drm cursor hotspot is incompatible with the legacy vmwgfx hotspot (the core drm hotspot is reset when the drm_mode_cursor ioctl is used), we need to keep track of both and add them when the device hotspot is set. We assume that either is always zero. Signed-off-by: Thomas Hellstrom Reviewed-by: Sinclair Yeh diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index a09cf85..c49812b 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -1233,6 +1233,7 @@ static void vmw_master_drop(struct drm_device *dev, vmw_fp->locked_master = drm_master_get(file_priv->master); ret = ttm_vt_lock(&vmaster->lock, false, vmw_fp->tfile); + vmw_kms_legacy_hotspot_clear(dev_priv); if (unlikely((ret != 0))) { DRM_ERROR("Unable to lock TTM at VT switch.\n"); drm_master_put(&vmw_fp->locked_master); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index a8ae9df..469cdd5 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -925,6 +925,7 @@ int vmw_kms_present(struct vmw_private *dev_priv, uint32_t num_clips); int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +void vmw_kms_legacy_hotspot_clear(struct vmw_private *dev_priv); int vmw_dumb_create(struct drm_file *file_priv, struct drm_device *dev, diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 9fcd7f8..9b4bb9e 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -133,13 +133,19 @@ void vmw_cursor_update_position(struct vmw_private *dev_priv, vmw_mmio_write(++count, fifo_mem + SVGA_FIFO_CURSOR_COUNT); } -int vmw_du_crtc_cursor_set(struct drm_crtc *crtc, struct drm_file *file_priv, - uint32_t handle, uint32_t width, uint32_t height) + +/* + * vmw_du_crtc_cursor_set2 - Driver cursor_set2 callback. + */ +int vmw_du_crtc_cursor_set2(struct drm_crtc *crtc, struct drm_file *file_priv, + uint32_t handle, uint32_t width, uint32_t height, + int32_t hot_x, int32_t hot_y) { struct vmw_private *dev_priv = vmw_priv(crtc->dev); struct vmw_display_unit *du = vmw_crtc_to_du(crtc); struct vmw_surface *surface = NULL; struct vmw_dma_buffer *dmabuf = NULL; + s32 hotspot_x, hotspot_y; int ret; /* @@ -151,6 +157,8 @@ int vmw_du_crtc_cursor_set(struct drm_crtc *crtc, struct drm_file *file_priv, */ drm_modeset_unlock_crtc(crtc); drm_modeset_lock_all(dev_priv->dev); + hotspot_x = hot_x + du->hotspot_x; + hotspot_y = hot_y + du->hotspot_y; /* A lot of the code assumes this */ if (handle && (width != 64 || height != 64)) { @@ -187,31 +195,34 @@ int vmw_du_crtc_cursor_set(struct drm_crtc *crtc, struct drm_file *file_priv, vmw_dmabuf_unreference(&du->cursor_dmabuf); /* setup new image */ + ret = 0; if (surface) { /* vmw_user_surface_lookup takes one reference */ du->cursor_surface = surface; du->cursor_surface->snooper.crtc = crtc; du->cursor_age = du->cursor_surface->snooper.age; - vmw_cursor_update_image(dev_priv, surface->snooper.image, - 64, 64, du->hotspot_x, du->hotspot_y); + ret = vmw_cursor_update_image(dev_priv, surface->snooper.image, + 64, 64, hotspot_x, hotspot_y); } else if (dmabuf) { /* vmw_user_surface_lookup takes one reference */ du->cursor_dmabuf = dmabuf; ret = vmw_cursor_update_dmabuf(dev_priv, dmabuf, width, height, - du->hotspot_x, du->hotspot_y); + hotspot_x, hotspot_y); } else { vmw_cursor_update_position(dev_priv, false, 0, 0); - ret = 0; goto out; } - vmw_cursor_update_position(dev_priv, true, - du->cursor_x + du->hotspot_x, - du->cursor_y + du->hotspot_y); + if (!ret) { + vmw_cursor_update_position(dev_priv, true, + du->cursor_x + hotspot_x, + du->cursor_y + hotspot_y); + du->core_hotspot_x = hot_x; + du->core_hotspot_y = hot_y; + } - ret = 0; out: drm_modeset_unlock_all(dev_priv->dev); drm_modeset_lock_crtc(crtc, crtc->cursor); @@ -239,8 +250,10 @@ int vmw_du_crtc_cursor_move(struct drm_crtc *crtc, int x, int y) drm_modeset_lock_all(dev_priv->dev); vmw_cursor_update_position(dev_priv, shown, - du->cursor_x + du->hotspot_x, - du->cursor_y + du->hotspot_y); + du->cursor_x + du->hotspot_x + + du->core_hotspot_x, + du->cursor_y + du->hotspot_y + + du->core_hotspot_y); drm_modeset_unlock_all(dev_priv->dev); drm_modeset_lock_crtc(crtc, crtc->cursor); @@ -334,6 +347,29 @@ err_unreserve: ttm_bo_unreserve(bo); } +/** + * vmw_kms_legacy_hotspot_clear - Clear legacy hotspots + * + * @dev_priv: Pointer to the device private struct. + * + * Clears all legacy hotspots. + */ +void vmw_kms_legacy_hotspot_clear(struct vmw_private *dev_priv) +{ + struct drm_device *dev = dev_priv->dev; + struct vmw_display_unit *du; + struct drm_crtc *crtc; + + drm_modeset_lock_all(dev); + drm_for_each_crtc(crtc, dev) { + du = vmw_crtc_to_du(crtc); + + du->hotspot_x = 0; + du->hotspot_y = 0; + } + drm_modeset_unlock_all(dev); +} + void vmw_kms_cursor_post_execbuf(struct vmw_private *dev_priv) { struct drm_device *dev = dev_priv->dev; @@ -351,7 +387,9 @@ void vmw_kms_cursor_post_execbuf(struct vmw_private *dev_priv) du->cursor_age = du->cursor_surface->snooper.age; vmw_cursor_update_image(dev_priv, du->cursor_surface->snooper.image, - 64, 64, du->hotspot_x, du->hotspot_y); + 64, 64, + du->hotspot_x + du->core_hotspot_x, + du->hotspot_y + du->core_hotspot_y); } mutex_unlock(&dev->mode_config.mutex); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h index 782df7c..edd8150 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h @@ -159,6 +159,8 @@ struct vmw_display_unit { int hotspot_x; int hotspot_y; + s32 core_hotspot_x; + s32 core_hotspot_y; unsigned unit; @@ -193,8 +195,9 @@ void vmw_du_crtc_restore(struct drm_crtc *crtc); void vmw_du_crtc_gamma_set(struct drm_crtc *crtc, u16 *r, u16 *g, u16 *b, uint32_t start, uint32_t size); -int vmw_du_crtc_cursor_set(struct drm_crtc *crtc, struct drm_file *file_priv, - uint32_t handle, uint32_t width, uint32_t height); +int vmw_du_crtc_cursor_set2(struct drm_crtc *crtc, struct drm_file *file_priv, + uint32_t handle, uint32_t width, uint32_t height, + int32_t hot_x, int32_t hot_y); int vmw_du_crtc_cursor_move(struct drm_crtc *crtc, int x, int y); int vmw_du_connector_dpms(struct drm_connector *connector, int mode); void vmw_du_connector_save(struct drm_connector *connector); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c index bb63e4d..52caecb 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c @@ -297,7 +297,7 @@ static int vmw_ldu_crtc_set_config(struct drm_mode_set *set) static struct drm_crtc_funcs vmw_legacy_crtc_funcs = { .save = vmw_du_crtc_save, .restore = vmw_du_crtc_restore, - .cursor_set = vmw_du_crtc_cursor_set, + .cursor_set2 = vmw_du_crtc_cursor_set2, .cursor_move = vmw_du_crtc_cursor_move, .gamma_set = vmw_du_crtc_gamma_set, .destroy = vmw_ldu_crtc_destroy, diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c index b96d1ab..13926ff 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c @@ -533,7 +533,7 @@ out_no_fence: static struct drm_crtc_funcs vmw_screen_object_crtc_funcs = { .save = vmw_du_crtc_save, .restore = vmw_du_crtc_restore, - .cursor_set = vmw_du_crtc_cursor_set, + .cursor_set2 = vmw_du_crtc_cursor_set2, .cursor_move = vmw_du_crtc_cursor_move, .gamma_set = vmw_du_crtc_gamma_set, .destroy = vmw_sou_crtc_destroy, diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c index b1fc1c0..f823fc3 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -1043,7 +1043,7 @@ out_finish: static struct drm_crtc_funcs vmw_stdu_crtc_funcs = { .save = vmw_du_crtc_save, .restore = vmw_du_crtc_restore, - .cursor_set = vmw_du_crtc_cursor_set, + .cursor_set2 = vmw_du_crtc_cursor_set2, .cursor_move = vmw_du_crtc_cursor_move, .gamma_set = vmw_du_crtc_gamma_set, .destroy = vmw_stdu_crtc_destroy, -- cgit v0.10.2 From 928c75fbeb6b62edbdff7c52ee1e9a3591faee4f Mon Sep 17 00:00:00 2001 From: LABBE Corentin Date: Tue, 24 Nov 2015 15:34:09 +0100 Subject: usb: phy: msm: fix a possible NULL dereference of_match_device could return NULL, and so cause a NULL pointer dereference later. Even if the probability of this case is very low, fixing it made static analyzers happy. Solving this with of_device_get_match_data made also code simplier. Reported-by: coverity (CID 1324133) Signed-off-by: LABBE Corentin Signed-off-by: Felipe Balbi diff --git a/drivers/usb/phy/phy-msm-usb.c b/drivers/usb/phy/phy-msm-usb.c index 80eb991..0d19a6d 100644 --- a/drivers/usb/phy/phy-msm-usb.c +++ b/drivers/usb/phy/phy-msm-usb.c @@ -1506,7 +1506,6 @@ static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg) { struct msm_otg_platform_data *pdata; struct extcon_dev *ext_id, *ext_vbus; - const struct of_device_id *id; struct device_node *node = pdev->dev.of_node; struct property *prop; int len, ret, words; @@ -1518,8 +1517,9 @@ static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg) motg->pdata = pdata; - id = of_match_device(msm_otg_dt_match, &pdev->dev); - pdata->phy_type = (enum msm_usb_phy_type) id->data; + pdata->phy_type = (enum msm_usb_phy_type)of_device_get_match_data(&pdev->dev); + if (!pdata->phy_type) + return 1; motg->link_rst = devm_reset_control_get(&pdev->dev, "link"); if (IS_ERR(motg->link_rst)) -- cgit v0.10.2 From 4d05591094979d24a0e728f246589963c40102e9 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 7 Dec 2015 21:23:15 -0800 Subject: usb: musb: core: Fix pm runtime for deferred probe If musb_init_controller fails at musb_platform_init, we have already called pm_runtime_irq_safe for musb and that causes the pm runtime count to be enabled for parent before the parent has completed initialization. This causes pm to stop working as on unload nothing gets idled. This issue can be reproduced at least with: # modprobe omap2430 HS USB OTG: no transceiver configured musb-hdrc musb-hdrc.0.auto: musb_init_controller failed with status -517 # modprobe phy-twl4030-usb # rmmod omap2430 And after the steps above omap2430 will block deeper idle states on omap3. To fix this, let's not enable pm runtime until we need to and the parent has been initialized. Note that this does not fix the issue of PM being broken for musb during runtime. Signed-off-by: Tony Lindgren Signed-off-by: Felipe Balbi diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c index 18cfc0a..3c07ffd 100644 --- a/drivers/usb/musb/musb_core.c +++ b/drivers/usb/musb/musb_core.c @@ -2017,7 +2017,6 @@ musb_init_controller(struct device *dev, int nIrq, void __iomem *ctrl) /* We need musb_read/write functions initialized for PM */ pm_runtime_use_autosuspend(musb->controller); pm_runtime_set_autosuspend_delay(musb->controller, 200); - pm_runtime_irq_safe(musb->controller); pm_runtime_enable(musb->controller); /* The musb_platform_init() call: @@ -2218,6 +2217,12 @@ musb_init_controller(struct device *dev, int nIrq, void __iomem *ctrl) pm_runtime_put(musb->controller); + /* + * For why this is currently needed, see commit 3e43a0725637 + * ("usb: musb: core: add pm_runtime_irq_safe()") + */ + pm_runtime_irq_safe(musb->controller); + return 0; fail5: -- cgit v0.10.2 From 27681abc429554622a188a3a0b213fb1c20040f7 Mon Sep 17 00:00:00 2001 From: Mian Yousaf Kaukab Date: Sat, 28 Nov 2015 18:35:44 +0100 Subject: usb: gadget: uvc: fix permissions of configfs attributes 76e0da3 "usb-gadget/uvc: use per-attribute show and store methods" removed write permission for writeable attributes. Correct attribute permissions. Fixes: 76e0da3 "usb-gadget/uvc: use per-attribute show and store methods" Reviewed-by: Christoph Hellwig Reviewed-by: Laurent Pinchart Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Felipe Balbi diff --git a/drivers/usb/gadget/function/uvc_configfs.c b/drivers/usb/gadget/function/uvc_configfs.c index 289ebca..ad8c9b0 100644 --- a/drivers/usb/gadget/function/uvc_configfs.c +++ b/drivers/usb/gadget/function/uvc_configfs.c @@ -20,7 +20,7 @@ #define UVC_ATTR(prefix, cname, aname) \ static struct configfs_attribute prefix##attr_##cname = { \ .ca_name = __stringify(aname), \ - .ca_mode = S_IRUGO, \ + .ca_mode = S_IRUGO | S_IWUGO, \ .ca_owner = THIS_MODULE, \ .show = prefix##cname##_show, \ .store = prefix##cname##_store, \ -- cgit v0.10.2 From f8062386671a596ca7022c61727a14a25679a0a1 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 5 Dec 2015 16:13:53 -0800 Subject: of/fdt: Add mutex protection for calls to __unflatten_device_tree() __unflatten_device_tree() calls unflatten_dt_node(), which declares a static variable. It is therefore not reentrant. One of the callers of __unflatten_device_tree(), unflatten_device_tree(), is only called once during early initialization and does not need to be protected. The other caller, of_fdt_unflatten_tree(), can be called at any time, possibly multiple times in parallel. This can happen, for example, if multiple devicetree overlays have to be loaded and installed. Without this protection, errors such as the following may be seen. kernel: End of tree marker overwritten: e6a3a458 kernel: find_target_node: Failed to find target-indirect node at /fragment@0 kernel: __of_overlay_create: of_build_overlay_info() failed for tree@/ Add a mutex to of_fdt_unflatten_tree() to make the call reentrant. Cc: Pantelis Antoniou Signed-off-by: Guenter Roeck Cc: stable@vger.kernel.org # v4.1+ Signed-off-by: Rob Herring diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 1bbe3a9..655f79d 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -436,6 +437,8 @@ static void *kernel_tree_alloc(u64 size, u64 align) return kzalloc(size, GFP_KERNEL); } +static DEFINE_MUTEX(of_fdt_unflatten_mutex); + /** * of_fdt_unflatten_tree - create tree of device_nodes from flat blob * @@ -447,7 +450,9 @@ static void *kernel_tree_alloc(u64 size, u64 align) void of_fdt_unflatten_tree(const unsigned long *blob, struct device_node **mynodes) { + mutex_lock(&of_fdt_unflatten_mutex); __unflatten_device_tree(blob, mynodes, &kernel_tree_alloc); + mutex_unlock(&of_fdt_unflatten_mutex); } EXPORT_SYMBOL_GPL(of_fdt_unflatten_tree); -- cgit v0.10.2 From d3632493c70b6a866a77264cd8cfdeb89958b906 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 20 Nov 2015 11:04:12 -0800 Subject: IB/cma: Add a missing rcu_read_unlock() Ensure that validate_ipv4_net_dev() calls rcu_read_unlock() if fib_lookup() fails. Detected by sparse. Compile-tested only. Fixes: "IB/cma: Validate routing of incoming requests" (commit f887f2ac87c2). Cc: Haggai Eran Cc: stable Reviewed-by: Sagi Grimberg Reviewed-by: Haggai Eran Reviewed-by: Jason Gunthorpe Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 944cd90..d2d5d00 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1126,10 +1126,7 @@ static bool validate_ipv4_net_dev(struct net_device *net_dev, rcu_read_lock(); err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); - if (err) - return false; - - ret = FIB_RES_DEV(res) == net_dev; + ret = err == 0 && FIB_RES_DEV(res) == net_dev; rcu_read_unlock(); return ret; -- cgit v0.10.2 From 533708867dd6388f643f12c87465b59e732d729d Mon Sep 17 00:00:00 2001 From: Hal Rosenstock Date: Fri, 13 Nov 2015 15:22:22 -0500 Subject: IB/mad: Require CM send method for everything except ClassPortInfo Receipt of CM MAD with other than the Send method for an attribute other than the ClassPortInfo attribute is invalid. CM attributes other than ClassPortInfo only use the send method. The SRP initiator does not maintain a timeout policy for CM connect requests relies on the CM layer to do that. The result was that the SRP initiator hung as the connect request never completed. A new SRP target has been observed to respond to Send CM REQ with GetResp of CM REQ with bad status. This is non conformant with IBA spec but exposes a vulnerability in the current MAD/CM code which will respond to the incoming GetResp of CM REQ as if it was a valid incoming Send of CM REQ rather than tossing this on the floor. It also causes the MAD layer not to retransmit the original REQ even though it has not received a REP. Reviewed-by: Sagi Grimberg Signed-off-by: Hal Rosenstock Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 8d8af7a..2281de1 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -1811,6 +1811,11 @@ static int validate_mad(const struct ib_mad_hdr *mad_hdr, if (qp_num == 0) valid = 1; } else { + /* CM attributes other than ClassPortInfo only use Send method */ + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) && + (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) && + (mad_hdr->method != IB_MGMT_METHOD_SEND)) + goto out; /* Filter GSI packets sent to QP0 */ if (qp_num != 0) valid = 1; diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 188df91..ec9b44d 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -237,6 +237,8 @@ struct ib_vendor_mad { u8 data[IB_MGMT_VENDOR_DATA]; }; +#define IB_MGMT_CLASSPORTINFO_ATTR_ID cpu_to_be16(0x0001) + struct ib_class_port_info { u8 base_version; u8 class_version; -- cgit v0.10.2 From a5e14ba334e202c58e45ef47414ec94c585c1a8c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 28 Oct 2015 13:28:15 +0200 Subject: mlx4: Expose correct max_sge_rd limit mlx4 devices (ConnectX-2, ConnectX-3) has a limitation where rdma read work queue entries cannot exceed 512 bytes. A rdma_read wqe needs to fit in 512 bytes: - wqe control segment (16 bytes) - rdma segment (16 bytes) - scatter elements (16 bytes each) So max_sge_rd should be: (512 - 16 - 16) / 16 = 30. Signed-off-by: Sagi Grimberg Reviewed-by: Steve Wise Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index f567160..97d6878 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -456,7 +456,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); - props->max_sge_rd = props->max_sge; + props->max_sge_rd = MLX4_MAX_SGE_RD; props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7501626..d3133be 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -427,6 +427,17 @@ enum { }; enum { + /* + * Max wqe size for rdma read is 512 bytes, so this + * limits our max_sge_rd as the wqe needs to fit: + * - ctrl segment (16 bytes) + * - rdma segment (16 bytes) + * - scatter elements (16 bytes each) + */ + MLX4_MAX_SGE_RD = (512 - 16 - 16) / 16 +}; + +enum { MLX4_DEV_PMC_SUBTYPE_GUID_INFO = 0x14, MLX4_DEV_PMC_SUBTYPE_PORT_INFO = 0x15, MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE = 0x16, -- cgit v0.10.2 From f1a47d37fbc65bc49d32f23ab61c3e09fefeab73 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 28 Oct 2015 13:28:16 +0200 Subject: iser-target: Remove explicit mlx4 work-around The driver now exposes sufficient limits so we can avoid having mlx4 specific work-around. Signed-off-by: Sagi Grimberg Reviewed-by: Steve Wise Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index dfbbbb2..8a51c3b 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -157,16 +157,9 @@ isert_create_qp(struct isert_conn *isert_conn, attr.recv_cq = comp->cq; attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS; attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1; - /* - * FIXME: Use devattr.max_sge - 2 for max_send_sge as - * work-around for RDMA_READs with ConnectX-2. - * - * Also, still make sure to have at least two SGEs for - * outgoing control PDU responses. - */ - attr.cap.max_send_sge = max(2, device->dev_attr.max_sge - 2); - isert_conn->max_sge = attr.cap.max_send_sge; - + attr.cap.max_send_sge = device->dev_attr.max_sge; + isert_conn->max_sge = min(device->dev_attr.max_sge, + device->dev_attr.max_sge_rd); attr.cap.max_recv_sge = 1; attr.sq_sig_type = IB_SIGNAL_REQ_WR; attr.qp_type = IB_QPT_RC; -- cgit v0.10.2 From 4ad78628445d26e5e9487b2e8f23274ad7b0f5d3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 8 Dec 2015 03:07:22 -0500 Subject: 9p: ->evict_inode() should kick out ->i_data, not ->i_mapping For block devices the pagecache is associated with the inode on bdevfs, not with the aliasing ones on the mountable filesystems. The latter have its own ->i_data empty and ->i_mapping pointing to the (unique per major/minor) bdevfs inode. That guarantees cache coherence between all block device inodes with the same device number. Eviction of an alias inode has no business trying to evict the pages belonging to bdevfs one; moreover, ->i_mapping is only safe to access when the thing is opened. At the time of ->evict_inode() the victim is definitely *not* opened. We are about to kill the address space embedded into struct inode (inode->i_data) and that's what we need to empty of any pages. 9p instance tries to empty inode->i_mapping instead, which is both unsafe and bogus - if we have several device nodes with the same device number in different places, closing one of them should not try to empty the (shared) page cache. Fortunately, other instances in the tree are OK; they are evicting from &inode->i_data instead, as 9p one should. Cc: stable@vger.kernel.org # v2.6.32+, ones prior to 2.6.36 need only half of that Reported-by: "Suzuki K. Poulose" Tested-by: "Suzuki K. Poulose" Signed-off-by: Al Viro diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 699941e..5110785 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -451,9 +451,9 @@ void v9fs_evict_inode(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); - truncate_inode_pages_final(inode->i_mapping); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - filemap_fdatawrite(inode->i_mapping); + filemap_fdatawrite(&inode->i_data); v9fs_cache_inode_put_cookie(inode); /* clunk the fid stashed in writeback_fid */ -- cgit v0.10.2 From 2d4594acbf6d8f75a27f3578476b6a27d8b13ebb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 8 Dec 2015 12:22:47 -0500 Subject: fix the regression from "direct-io: Fix negative return from dio read beyond eof" Sure, it's better to bail out of past-the-eof read and return 0 than return a bogus negative value on such. Only we'd better make sure we are bailing out with 0 and not -ENOMEM... Cc: stable@vger.kernel.org Signed-off-by: Al Viro diff --git a/fs/direct-io.c b/fs/direct-io.c index 1c75a3a..602e844 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1175,6 +1175,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, if (dio->flags & DIO_LOCKING) mutex_unlock(&inode->i_mutex); kmem_cache_free(dio_cache, dio); + retval = 0; goto out; } -- cgit v0.10.2 From af096e2235c5de76af7e8749f59a90de07f5e943 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Tue, 8 Dec 2015 13:47:34 -0700 Subject: null_blk: Fix error path in module initialization Module couldn't release resource properly during the initialization. To fix this issue, we will clean up the proper resource before returning. Signed-off-by: Minfei Huang Signed-off-by: Jens Axboe diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 7981b74..8162475 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -766,7 +766,9 @@ out: static int __init null_init(void) { + int ret = 0; unsigned int i; + struct nullb *nullb; if (bs > PAGE_SIZE) { pr_warn("null_blk: invalid block size\n"); @@ -808,22 +810,29 @@ static int __init null_init(void) 0, 0, NULL); if (!ppa_cache) { pr_err("null_blk: unable to create ppa cache\n"); - return -ENOMEM; + ret = -ENOMEM; + goto err_ppa; } } for (i = 0; i < nr_devices; i++) { - if (null_add_dev()) { - unregister_blkdev(null_major, "nullb"); - goto err_ppa; - } + ret = null_add_dev(); + if (ret) + goto err_dev; } pr_info("null: module loaded\n"); return 0; -err_ppa: + +err_dev: + while (!list_empty(&nullb_list)) { + nullb = list_entry(nullb_list.next, struct nullb, list); + null_del_dev(nullb); + } kmem_cache_destroy(ppa_cache); - return -EINVAL; +err_ppa: + unregister_blkdev(null_major, "nullb"); + return ret; } static void __exit null_exit(void) -- cgit v0.10.2 From db2f24dc240856fb1d78005307f1523b7b3c121b Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 18 Nov 2015 09:37:15 +0100 Subject: um: Fix get_signal() usage If get_signal() returns us a signal to post we must not call it again, otherwise the already posted signal will be overridden. Before commit a610d6e672d this was the case as we stopped the while after a successful handle_signal(). Cc: # 3.10- Fixes: a610d6e672d ("pull clearing RESTORE_SIGMASK into block_sigmask()") Signed-off-by: Richard Weinberger diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 57acbd6..fc8be0e 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -69,7 +69,7 @@ void do_signal(struct pt_regs *regs) struct ksignal ksig; int handled_sig = 0; - while (get_signal(&ksig)) { + if (get_signal(&ksig)) { handled_sig = 1; /* Whee! Actually deliver the signal. */ handle_signal(&ksig, regs); -- cgit v0.10.2 From fb1770aa78a43530940d0c2dd161e77bc705bdac Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 18 Nov 2015 23:12:09 +0900 Subject: arch: um: fix error when linking vmlinux. On gcc Ubuntu 4.8.4-2ubuntu1~14.04, linking vmlinux fails with: arch/um/os-Linux/built-in.o: In function `os_timer_create': /android/kernel/android/arch/um/os-Linux/time.c:51: undefined reference to `timer_create' arch/um/os-Linux/built-in.o: In function `os_timer_set_interval': /android/kernel/android/arch/um/os-Linux/time.c:84: undefined reference to `timer_settime' arch/um/os-Linux/built-in.o: In function `os_timer_remain': /android/kernel/android/arch/um/os-Linux/time.c:109: undefined reference to `timer_gettime' arch/um/os-Linux/built-in.o: In function `os_timer_one_shot': /android/kernel/android/arch/um/os-Linux/time.c:132: undefined reference to `timer_settime' arch/um/os-Linux/built-in.o: In function `os_timer_disable': /android/kernel/android/arch/um/os-Linux/time.c:145: undefined reference to `timer_settime' This is because -lrt appears in the generated link commandline after arch/um/os-Linux/built-in.o. Fix this by removing -lrt from arch/um/Makefile and adding it to the UM-specific section of scripts/link-vmlinux.sh. Signed-off-by: Lorenzo Colitti Signed-off-by: Richard Weinberger diff --git a/arch/um/Makefile b/arch/um/Makefile index 25ed409..e3abe6f 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -131,7 +131,7 @@ export LDS_ELF_FORMAT := $(ELF_FORMAT) # The wrappers will select whether using "malloc" or the kernel allocator. LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc -LD_FLAGS_CMDLINE = $(foreach opt,$(LDFLAGS),-Wl,$(opt)) -lrt +LD_FLAGS_CMDLINE = $(foreach opt,$(LDFLAGS),-Wl,$(opt)) # Used by link-vmlinux.sh which has special support for um link export CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 1a10d8a..dacf71a 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -62,7 +62,7 @@ vmlinux_link() -Wl,--start-group \ ${KBUILD_VMLINUX_MAIN} \ -Wl,--end-group \ - -lutil ${1} + -lutil -lrt ${1} rm -f linux fi } -- cgit v0.10.2 From 8090bfd2bb9abc6293f5cedef8ec9be84a913d2f Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 29 Nov 2015 21:13:50 +0100 Subject: um: Fix fpstate handling The x86 FPU cleanup changed fpstate to a plain integer. UML on x86 has to deal with that too. Signed-off-by: Richard Weinberger diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 06934a8..e5f854c 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -211,7 +211,7 @@ static int copy_sc_from_user(struct pt_regs *regs, if (err) return 1; - err = convert_fxsr_from_user(&fpx, sc.fpstate); + err = convert_fxsr_from_user(&fpx, (void *)sc.fpstate); if (err) return 1; @@ -227,7 +227,7 @@ static int copy_sc_from_user(struct pt_regs *regs, { struct user_i387_struct fp; - err = copy_from_user(&fp, sc.fpstate, + err = copy_from_user(&fp, (void *)sc.fpstate, sizeof(struct user_i387_struct)); if (err) return 1; @@ -291,7 +291,7 @@ static int copy_sc_to_user(struct sigcontext __user *to, #endif #undef PUTREG sc.oldmask = mask; - sc.fpstate = to_fp; + sc.fpstate = (unsigned long)to_fp; err = copy_to_user(to, &sc, sizeof(struct sigcontext)); if (err) @@ -468,12 +468,10 @@ long sys_sigreturn(void) struct sigframe __user *frame = (struct sigframe __user *)(sp - 8); sigset_t set; struct sigcontext __user *sc = &frame->sc; - unsigned long __user *oldmask = &sc->oldmask; - unsigned long __user *extramask = frame->extramask; int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); - if (copy_from_user(&set.sig[0], oldmask, sizeof(set.sig[0])) || - copy_from_user(&set.sig[1], extramask, sig_size)) + if (copy_from_user(&set.sig[0], (void *)sc->oldmask, sizeof(set.sig[0])) || + copy_from_user(&set.sig[1], frame->extramask, sig_size)) goto segfault; set_current_blocked(&set); @@ -505,6 +503,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, { struct rt_sigframe __user *frame; int err = 0, sig = ksig->sig; + unsigned long fp_to; frame = (struct rt_sigframe __user *) round_down(stack_top - sizeof(struct rt_sigframe), 16); @@ -526,7 +525,10 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, err |= __save_altstack(&frame->uc.uc_stack, PT_REGS_SP(regs)); err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs, set->sig[0]); - err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); + + fp_to = (unsigned long)&frame->fpstate; + + err |= __put_user(fp_to, &frame->uc.uc_mcontext.fpstate); if (sizeof(*set) == 16) { err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); err |= __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); -- cgit v0.10.2 From 887a9853092c09e20598f4a7f91ac1cfb762be50 Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Tue, 1 Dec 2015 17:18:55 -0300 Subject: um: fix returns without va_end When using va_list ensure that va_start will be followed by va_end. Signed-off-by: Geyslan G. Bem Signed-off-by: Richard Weinberger diff --git a/arch/um/drivers/net_user.c b/arch/um/drivers/net_user.c index e697a41..e9f8445 100644 --- a/arch/um/drivers/net_user.c +++ b/arch/um/drivers/net_user.c @@ -249,21 +249,23 @@ void close_addr(unsigned char *addr, unsigned char *netmask, void *arg) char *split_if_spec(char *str, ...) { - char **arg, *end; + char **arg, *end, *ret = NULL; va_list ap; va_start(ap, str); while ((arg = va_arg(ap, char **)) != NULL) { if (*str == '\0') - return NULL; + goto out; end = strchr(str, ','); if (end != str) *arg = str; if (end == NULL) - return NULL; + goto out; *end++ = '\0'; str = end; } + ret = str; +out: va_end(ap); - return str; + return ret; } -- cgit v0.10.2 From 73d4da7b9fec13cbddcf548888c4d0e46f1d5087 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Thu, 8 Oct 2015 13:21:33 +0800 Subject: IB/mlx4: Use correct order of variables in log message There is a mis-order in mlx4 log. Fix it. Signed-off-by: Wengang Wang Acked-by: Or Gerlitz Signed-off-by: Doug Ledford diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 2177e56..d48d579 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1010,7 +1010,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && smp->method == IB_MGMT_METHOD_GET) || network_view) { mlx4_err(dev, "Unprivileged slave %d is trying to execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. Rejecting\n", - slave, smp->method, smp->mgmt_class, + slave, smp->mgmt_class, smp->method, network_view ? "Network" : "Host", be16_to_cpu(smp->attr_id)); return -EPERM; -- cgit v0.10.2 From 0ef2f05c7e02ff99c0b5b583d7dee2cd12b053f2 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Thu, 8 Oct 2015 13:27:04 +0800 Subject: IB/mlx4: Use vmalloc for WR buffers when needed There are several hits that WR buffer allocation(kmalloc) failed. It failed at order 3 and/or 4 contigous pages allocation. At the same time there are actually 100MB+ free memory but well fragmented. So try vmalloc when kmalloc failed. Signed-off-by: Wengang Wang Acked-by: Or Gerlitz Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index a2e4ca5..13eaaf4 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -795,8 +796,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp); + if (!qp->sq.wrid) + qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), + gfp, PAGE_KERNEL); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp); + if (!qp->rq.wrid) + qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64), + gfp, PAGE_KERNEL); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; @@ -886,8 +893,8 @@ err_wrid: if (qp_has_rq(init_attr)) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); } else { - kfree(qp->sq.wrid); - kfree(qp->rq.wrid); + kvfree(qp->sq.wrid); + kvfree(qp->rq.wrid); } err_mtt: @@ -1062,8 +1069,8 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, &qp->db); ib_umem_release(qp->umem); } else { - kfree(qp->sq.wrid); - kfree(qp->rq.wrid); + kvfree(qp->sq.wrid); + kvfree(qp->rq.wrid); if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) free_proxy_bufs(&dev->ib_dev, qp); diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index dce5dfe..8d133c4 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "mlx4_ib.h" #include "user.h" @@ -172,8 +173,12 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL); if (!srq->wrid) { - err = -ENOMEM; - goto err_mtt; + srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64), + GFP_KERNEL, PAGE_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_mtt; + } } } @@ -204,7 +209,7 @@ err_wrid: if (pd->uobject) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); else - kfree(srq->wrid); + kvfree(srq->wrid); err_mtt: mlx4_mtt_cleanup(dev->dev, &srq->mtt); -- cgit v0.10.2 From ab5cdc31630c7596d81ca8fbe7d695f10666f39b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 21 Oct 2015 09:21:17 +0300 Subject: IB/mlx5: Postpone remove_keys under knowledge of coming preemption The remove_keys() logic is performed as garbage collection task. Such task is intended to be run when no other active processes are running. The need_resched() will return TRUE if there are user tasks to be activated in near future. In such case, we don't execute remove_keys() and postpone the garbage collection work to try to run in next cycle, in order to free CPU resources to other tasks. The possible pseudo-code to trigger such scenario: 1. Allocate a lot of MR to fill the cache above the limit. 2. Wait a small amount of time "to calm" the system. 3. Start CPU extensive operations on multi-node cluster. 4. Expect performance degradation during MR cache shrink operation. Signed-off-by: Leon Romanovsky Signed-off-by: Eli Cohen Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index ec8993a..6000f7a 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -381,7 +381,19 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) } } } else if (ent->cur > 2 * ent->limit) { - if (!someone_adding(cache) && + /* + * The remove_keys() logic is performed as garbage collection + * task. Such task is intended to be run when no other active + * processes are running. + * + * The need_resched() will return TRUE if there are user tasks + * to be activated in near future. + * + * In such case, we don't execute remove_keys() and postpone + * the garbage collection work to try to run in next cycle, + * in order to free CPU resources to other tasks. + */ + if (!need_resched() && !someone_adding(cache) && time_after(jiffies, cache->last_add + 300 * HZ)) { remove_keys(dev, i, 1); if (ent->cur > ent->limit) -- cgit v0.10.2 From 714a98fc3f20934ce8098667b809de954ebab93c Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 9 Dec 2015 10:06:05 +1000 Subject: drm/nouveau/pmu: remove whitelist for PGOB-exit WAR, enable by default NVIDIA have indicated that the workaround is required on all GK10[467] boards that have the PGOB fuse set. I've left the commandline option in place for now, as paranoia. Signed-off-by: Ben Skeggs diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/device.h b/drivers/gpu/drm/nouveau/include/nvkm/core/device.h index 8f76000..913192c 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/device.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/device.h @@ -159,7 +159,6 @@ struct nvkm_device_func { struct nvkm_device_quirk { u8 tv_pin_mask; u8 tv_gpio; - bool War00C800_0; }; struct nvkm_device_chip { diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c index caf22b5..62ad030 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c @@ -259,12 +259,6 @@ nvkm_device_pci_10de_0df4[] = { }; static const struct nvkm_device_pci_vendor -nvkm_device_pci_10de_0fcd[] = { - { 0x17aa, 0x3801, NULL, { .War00C800_0 = true } }, /* Lenovo Y510P */ - {} -}; - -static const struct nvkm_device_pci_vendor nvkm_device_pci_10de_0fd2[] = { { 0x1028, 0x0595, "GeForce GT 640M LE" }, { 0x1028, 0x05b2, "GeForce GT 640M LE" }, @@ -279,12 +273,6 @@ nvkm_device_pci_10de_0fe3[] = { }; static const struct nvkm_device_pci_vendor -nvkm_device_pci_10de_0fe4[] = { - { 0x144d, 0xc740, NULL, { .War00C800_0 = true } }, - {} -}; - -static const struct nvkm_device_pci_vendor nvkm_device_pci_10de_104b[] = { { 0x1043, 0x844c, "GeForce GT 625" }, { 0x1043, 0x846b, "GeForce GT 625" }, @@ -690,13 +678,6 @@ nvkm_device_pci_10de_1189[] = { static const struct nvkm_device_pci_vendor nvkm_device_pci_10de_1199[] = { { 0x1458, 0xd001, "GeForce GTX 760" }, - { 0x1462, 0x1106, "GeForce GTX 780M", { .War00C800_0 = true } }, /* Medion Erazer X7827 */ - {} -}; - -static const struct nvkm_device_pci_vendor -nvkm_device_pci_10de_11e0[] = { - { 0x1558, 0x5106, NULL, { .War00C800_0 = true } }, {} }; @@ -707,14 +688,6 @@ nvkm_device_pci_10de_11e3[] = { }; static const struct nvkm_device_pci_vendor -nvkm_device_pci_10de_11fc[] = { - { 0x1179, 0x0001, NULL, { .War00C800_0 = true } }, /* Toshiba Tecra W50 */ - { 0x17aa, 0x2211, NULL, { .War00C800_0 = true } }, /* Lenovo W541 */ - { 0x17aa, 0x221e, NULL, { .War00C800_0 = true } }, /* Lenovo W541 */ - {} -}; - -static const struct nvkm_device_pci_vendor nvkm_device_pci_10de_1247[] = { { 0x1043, 0x212a, "GeForce GT 635M" }, { 0x1043, 0x212b, "GeForce GT 635M" }, @@ -1368,7 +1341,7 @@ nvkm_device_pci_10de[] = { { 0x0fc6, "GeForce GTX 650" }, { 0x0fc8, "GeForce GT 740" }, { 0x0fc9, "GeForce GT 730" }, - { 0x0fcd, "GeForce GT 755M", nvkm_device_pci_10de_0fcd }, + { 0x0fcd, "GeForce GT 755M" }, { 0x0fce, "GeForce GT 640M LE" }, { 0x0fd1, "GeForce GT 650M" }, { 0x0fd2, "GeForce GT 640M", nvkm_device_pci_10de_0fd2 }, @@ -1382,7 +1355,7 @@ nvkm_device_pci_10de[] = { { 0x0fe1, "GeForce GT 730M" }, { 0x0fe2, "GeForce GT 745M" }, { 0x0fe3, "GeForce GT 745M", nvkm_device_pci_10de_0fe3 }, - { 0x0fe4, "GeForce GT 750M", nvkm_device_pci_10de_0fe4 }, + { 0x0fe4, "GeForce GT 750M" }, { 0x0fe9, "GeForce GT 750M" }, { 0x0fea, "GeForce GT 755M" }, { 0x0fec, "GeForce 710A" }, @@ -1497,12 +1470,12 @@ nvkm_device_pci_10de[] = { { 0x11c6, "GeForce GTX 650 Ti" }, { 0x11c8, "GeForce GTX 650" }, { 0x11cb, "GeForce GT 740" }, - { 0x11e0, "GeForce GTX 770M", nvkm_device_pci_10de_11e0 }, + { 0x11e0, "GeForce GTX 770M" }, { 0x11e1, "GeForce GTX 765M" }, { 0x11e2, "GeForce GTX 765M" }, { 0x11e3, "GeForce GTX 760M", nvkm_device_pci_10de_11e3 }, { 0x11fa, "Quadro K4000" }, - { 0x11fc, "Quadro K2100M", nvkm_device_pci_10de_11fc }, + { 0x11fc, "Quadro K2100M" }, { 0x1200, "GeForce GTX 560 Ti" }, { 0x1201, "GeForce GTX 560" }, { 0x1203, "GeForce GTX 460 SE v2" }, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gk104.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gk104.c index d942fa7..86f9f3b 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gk104.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gk104.c @@ -81,9 +81,7 @@ gk104_pmu_pgob(struct nvkm_pmu *pmu, bool enable) nvkm_mask(device, 0x000200, 0x00001000, 0x00001000); nvkm_rd32(device, 0x000200); - if ( nvkm_boolopt(device->cfgopt, "War00C800_0", - device->quirk ? device->quirk->War00C800_0 : false)) { - nvkm_info(&pmu->subdev, "hw bug workaround enabled\n"); + if (nvkm_boolopt(device->cfgopt, "War00C800_0", true)) { switch (device->chipset) { case 0xe4: magic(device, 0x04000000); -- cgit v0.10.2 From 5b01310cfc8d2302dcca1d8da42873edab2ef784 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 8 Dec 2015 17:44:02 -0500 Subject: powerpc/sbc8641: drop bogus PHY IRQ entries from DTS file This file was originally cloned off of the MPC8641D-HPCN reference platform, which actually had a PHY IRQ line connected. However this board does not. The bogus entry was largely inert and went undetected until commit 321beec5047af83db90c88114b7e664b156f49fe ("net: phy: Use interrupts when available in NOLINK state") was added to the tree. With the above commit, the board fails to NFS boot since it sits waiting for a PHY IRQ event that of course never arrives. Removing the bogus entries from the DTS file fixes the issue. Cc: Andrew Lunn Signed-off-by: Paul Gortmaker Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/boot/dts/sbc8641d.dts b/arch/powerpc/boot/dts/sbc8641d.dts index 631ede7..68f0ed7 100644 --- a/arch/powerpc/boot/dts/sbc8641d.dts +++ b/arch/powerpc/boot/dts/sbc8641d.dts @@ -227,23 +227,15 @@ reg = <0x520 0x20>; phy0: ethernet-phy@1f { - interrupt-parent = <&mpic>; - interrupts = <10 1>; reg = <0x1f>; }; phy1: ethernet-phy@0 { - interrupt-parent = <&mpic>; - interrupts = <10 1>; reg = <0>; }; phy2: ethernet-phy@1 { - interrupt-parent = <&mpic>; - interrupts = <10 1>; reg = <1>; }; phy3: ethernet-phy@2 { - interrupt-parent = <&mpic>; - interrupts = <10 1>; reg = <2>; }; tbi0: tbi-phy@11 { -- cgit v0.10.2 From dc9c41bd9ece090b54eb8f1bbdfb1930e10d3ae7 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Tue, 8 Dec 2015 16:59:25 +1100 Subject: Revert "powerpc/eeh: Don't unfreeze PHB PE after reset" This reverts commit 527d10ef3a315d3cb9dc098dacd61889a6c26439. The reverted commit breaks cxlflash devices following an EEH reset (and possibly other cxl devices, however this has not been tested). The reverted commit changed the behaviour of eeh_reset_device() so that PHB PEs are not unfrozen following the completion of the reset. This should not be problematic, as no device resources should have been associated with the PHB PE. However, when attempting to load the cxlflash driver after a reset, the driver attempts to read Vital Product Data through a call to pci_read_vpd() (which is called on the physical cxl device, not on the virtual AFU device). pci_read_vpd() in turn attempts to read from the cxl device's config space. This fails, as the PE it's trying to read from is still frozen. In turn, the driver gets an -ENODEV and fails to initialise. It appears this issue only affects some parts of the VPD area, as "lspci -vvv", which only reads a subset of the VPD bytes, is not broken by the original patch. At this stage, we don't fully understand why we're trying to read a frozen PE, and we don't know how this affects other cxl devices. It is possible that there is an underlying bug in the cxl driver or the powerpc CAPI support code, or alternatively a bug in the PCI resource allocation/mapping code that is incorrectly mapping resources to PE#0. As such, this fix is incomplete, however it is necessary to prevent a serious regression in CAPI support. In the meantime, revert the commit, especially as it was intended to be a non-functional change. Cc: Gavin Shan Cc: Ian Munsie Cc: Daniel Axtens Signed-off-by: Andrew Donnellan Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 80dfe89..8d14feb 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -590,16 +590,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) eeh_ops->configure_bridge(pe); eeh_pe_restore_bars(pe); - /* - * If it's PHB PE, the frozen state on all available PEs should have - * been cleared by the PHB reset. Otherwise, we unfreeze the PE and its - * child PEs because they might be in frozen state. - */ - if (!(pe->type & EEH_PE_PHB)) { - rc = eeh_clear_pe_frozen_state(pe, false); - if (rc) - return rc; - } + /* Clear frozen state */ + rc = eeh_clear_pe_frozen_state(pe, false); + if (rc) + return rc; /* Give the system 5 seconds to finish running the user-space * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, -- cgit v0.10.2 From e410b5cbabe70b1f1da08146481f6bd7e904643c Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Mon, 7 Dec 2015 15:02:52 +0800 Subject: drm/amdgpu: fix the lost duplicates checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chunming Zhou Reviewed-by: Christian König Reviewed-by: Jammy Zhou Cc: stable@vger.kernel.org diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index f6ea4b4..9c253c5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -477,6 +477,14 @@ static void amdgpu_gem_va_update_vm(struct amdgpu_device *adev, if (domain == AMDGPU_GEM_DOMAIN_CPU) goto error_unreserve; } + list_for_each_entry(entry, &duplicates, head) { + domain = amdgpu_mem_type_to_domain(entry->bo->mem.mem_type); + /* if anything is swapped out don't swap it in here, + just abort and wait for the next CS */ + if (domain == AMDGPU_GEM_DOMAIN_CPU) + goto error_unreserve; + } + r = amdgpu_vm_update_page_directory(adev, bo_va->vm); if (r) goto error_unreserve; -- cgit v0.10.2 From 5f3e226f511ec98d70c970f09cdb4ec46511cc5e Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Fri, 4 Dec 2015 23:09:03 +0200 Subject: radeon/cik: Fix GFX IB test on Big-Endian MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch makes the IB test on the GFX ring pass for CI-based cards installed in Big-Endian machines. Reviewed-by: Christian König Signed-off-by: Oded Gabbay Cc: stable@vger.kernel.org Signed-off-by: Alex Deucher diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index 0154db4..f81fb26 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -4173,11 +4173,7 @@ void cik_ring_ib_execute(struct radeon_device *rdev, struct radeon_ib *ib) control |= ib->length_dw | (vm_id << 24); radeon_ring_write(ring, header); - radeon_ring_write(ring, -#ifdef __BIG_ENDIAN - (2 << 0) | -#endif - (ib->gpu_addr & 0xFFFFFFFC)); + radeon_ring_write(ring, (ib->gpu_addr & 0xFFFFFFFC)); radeon_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF); radeon_ring_write(ring, control); } -- cgit v0.10.2 From 687f4b98d1f4e27508f7ad4bcce787c1ba58b289 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Fri, 4 Dec 2015 23:09:04 +0200 Subject: radeon: Fix VCE ring test for Big-Endian systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes the VCE ring test when running on Big-Endian machines. Every write to the ring needs to be translated to little-endian. Reviewed-by: Christian König Signed-off-by: Oded Gabbay Cc: stable@vger.kernel.org Signed-off-by: Alex Deucher diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c index 574f62b..86f57e4 100644 --- a/drivers/gpu/drm/radeon/radeon_vce.c +++ b/drivers/gpu/drm/radeon/radeon_vce.c @@ -699,12 +699,12 @@ bool radeon_vce_semaphore_emit(struct radeon_device *rdev, { uint64_t addr = semaphore->gpu_addr; - radeon_ring_write(ring, VCE_CMD_SEMAPHORE); - radeon_ring_write(ring, (addr >> 3) & 0x000FFFFF); - radeon_ring_write(ring, (addr >> 23) & 0x000FFFFF); - radeon_ring_write(ring, 0x01003000 | (emit_wait ? 1 : 0)); + radeon_ring_write(ring, cpu_to_le32(VCE_CMD_SEMAPHORE)); + radeon_ring_write(ring, cpu_to_le32((addr >> 3) & 0x000FFFFF)); + radeon_ring_write(ring, cpu_to_le32((addr >> 23) & 0x000FFFFF)); + radeon_ring_write(ring, cpu_to_le32(0x01003000 | (emit_wait ? 1 : 0))); if (!emit_wait) - radeon_ring_write(ring, VCE_CMD_END); + radeon_ring_write(ring, cpu_to_le32(VCE_CMD_END)); return true; } @@ -719,10 +719,10 @@ bool radeon_vce_semaphore_emit(struct radeon_device *rdev, void radeon_vce_ib_execute(struct radeon_device *rdev, struct radeon_ib *ib) { struct radeon_ring *ring = &rdev->ring[ib->ring]; - radeon_ring_write(ring, VCE_CMD_IB); - radeon_ring_write(ring, ib->gpu_addr); - radeon_ring_write(ring, upper_32_bits(ib->gpu_addr)); - radeon_ring_write(ring, ib->length_dw); + radeon_ring_write(ring, cpu_to_le32(VCE_CMD_IB)); + radeon_ring_write(ring, cpu_to_le32(ib->gpu_addr)); + radeon_ring_write(ring, cpu_to_le32(upper_32_bits(ib->gpu_addr))); + radeon_ring_write(ring, cpu_to_le32(ib->length_dw)); } /** @@ -738,12 +738,12 @@ void radeon_vce_fence_emit(struct radeon_device *rdev, struct radeon_ring *ring = &rdev->ring[fence->ring]; uint64_t addr = rdev->fence_drv[fence->ring].gpu_addr; - radeon_ring_write(ring, VCE_CMD_FENCE); - radeon_ring_write(ring, addr); - radeon_ring_write(ring, upper_32_bits(addr)); - radeon_ring_write(ring, fence->seq); - radeon_ring_write(ring, VCE_CMD_TRAP); - radeon_ring_write(ring, VCE_CMD_END); + radeon_ring_write(ring, cpu_to_le32(VCE_CMD_FENCE)); + radeon_ring_write(ring, cpu_to_le32(addr)); + radeon_ring_write(ring, cpu_to_le32(upper_32_bits(addr))); + radeon_ring_write(ring, cpu_to_le32(fence->seq)); + radeon_ring_write(ring, cpu_to_le32(VCE_CMD_TRAP)); + radeon_ring_write(ring, cpu_to_le32(VCE_CMD_END)); } /** @@ -765,7 +765,7 @@ int radeon_vce_ring_test(struct radeon_device *rdev, struct radeon_ring *ring) ring->idx, r); return r; } - radeon_ring_write(ring, VCE_CMD_END); + radeon_ring_write(ring, cpu_to_le32(VCE_CMD_END)); radeon_ring_unlock_commit(rdev, ring, false); for (i = 0; i < rdev->usec_timeout; i++) { -- cgit v0.10.2 From 361c32d39087e7caa99e629c0d7fb00643cb2190 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Fri, 4 Dec 2015 23:09:05 +0200 Subject: radeon: Fix VCE IB test on Big-Endian systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch makes the VCE IB test pass on Big-Endian systems. It converts to little-endian the contents of the VCE message. Reviewed-by: Christian König Signed-off-by: Oded Gabbay Cc: stable@vger.kernel.org Signed-off-by: Alex Deucher diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c index 86f57e4..7eb1ae7 100644 --- a/drivers/gpu/drm/radeon/radeon_vce.c +++ b/drivers/gpu/drm/radeon/radeon_vce.c @@ -361,31 +361,31 @@ int radeon_vce_get_create_msg(struct radeon_device *rdev, int ring, /* stitch together an VCE create msg */ ib.length_dw = 0; - ib.ptr[ib.length_dw++] = 0x0000000c; /* len */ - ib.ptr[ib.length_dw++] = 0x00000001; /* session cmd */ - ib.ptr[ib.length_dw++] = handle; - - ib.ptr[ib.length_dw++] = 0x00000030; /* len */ - ib.ptr[ib.length_dw++] = 0x01000001; /* create cmd */ - ib.ptr[ib.length_dw++] = 0x00000000; - ib.ptr[ib.length_dw++] = 0x00000042; - ib.ptr[ib.length_dw++] = 0x0000000a; - ib.ptr[ib.length_dw++] = 0x00000001; - ib.ptr[ib.length_dw++] = 0x00000080; - ib.ptr[ib.length_dw++] = 0x00000060; - ib.ptr[ib.length_dw++] = 0x00000100; - ib.ptr[ib.length_dw++] = 0x00000100; - ib.ptr[ib.length_dw++] = 0x0000000c; - ib.ptr[ib.length_dw++] = 0x00000000; - - ib.ptr[ib.length_dw++] = 0x00000014; /* len */ - ib.ptr[ib.length_dw++] = 0x05000005; /* feedback buffer */ - ib.ptr[ib.length_dw++] = upper_32_bits(dummy); - ib.ptr[ib.length_dw++] = dummy; - ib.ptr[ib.length_dw++] = 0x00000001; + ib.ptr[ib.length_dw++] = cpu_to_le32(0x0000000c); /* len */ + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000001); /* session cmd */ + ib.ptr[ib.length_dw++] = cpu_to_le32(handle); + + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000030); /* len */ + ib.ptr[ib.length_dw++] = cpu_to_le32(0x01000001); /* create cmd */ + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000000); + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000042); + ib.ptr[ib.length_dw++] = cpu_to_le32(0x0000000a); + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000001); + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000080); + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000060); + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000100); + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000100); + ib.ptr[ib.length_dw++] = cpu_to_le32(0x0000000c); + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000000); + + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000014); /* len */ + ib.ptr[ib.length_dw++] = cpu_to_le32(0x05000005); /* feedback buffer */ + ib.ptr[ib.length_dw++] = cpu_to_le32(upper_32_bits(dummy)); + ib.ptr[ib.length_dw++] = cpu_to_le32(dummy); + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000001); for (i = ib.length_dw; i < ib_size_dw; ++i) - ib.ptr[i] = 0x0; + ib.ptr[i] = cpu_to_le32(0x0); r = radeon_ib_schedule(rdev, &ib, NULL, false); if (r) { @@ -428,21 +428,21 @@ int radeon_vce_get_destroy_msg(struct radeon_device *rdev, int ring, /* stitch together an VCE destroy msg */ ib.length_dw = 0; - ib.ptr[ib.length_dw++] = 0x0000000c; /* len */ - ib.ptr[ib.length_dw++] = 0x00000001; /* session cmd */ - ib.ptr[ib.length_dw++] = handle; + ib.ptr[ib.length_dw++] = cpu_to_le32(0x0000000c); /* len */ + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000001); /* session cmd */ + ib.ptr[ib.length_dw++] = cpu_to_le32(handle); - ib.ptr[ib.length_dw++] = 0x00000014; /* len */ - ib.ptr[ib.length_dw++] = 0x05000005; /* feedback buffer */ - ib.ptr[ib.length_dw++] = upper_32_bits(dummy); - ib.ptr[ib.length_dw++] = dummy; - ib.ptr[ib.length_dw++] = 0x00000001; + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000014); /* len */ + ib.ptr[ib.length_dw++] = cpu_to_le32(0x05000005); /* feedback buffer */ + ib.ptr[ib.length_dw++] = cpu_to_le32(upper_32_bits(dummy)); + ib.ptr[ib.length_dw++] = cpu_to_le32(dummy); + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000001); - ib.ptr[ib.length_dw++] = 0x00000008; /* len */ - ib.ptr[ib.length_dw++] = 0x02000001; /* destroy cmd */ + ib.ptr[ib.length_dw++] = cpu_to_le32(0x00000008); /* len */ + ib.ptr[ib.length_dw++] = cpu_to_le32(0x02000001); /* destroy cmd */ for (i = ib.length_dw; i < ib_size_dw; ++i) - ib.ptr[i] = 0x0; + ib.ptr[i] = cpu_to_le32(0x0); r = radeon_ib_schedule(rdev, &ib, NULL, false); if (r) { -- cgit v0.10.2 From 9a811230481243f384b8036c6a558bfdbd961f78 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 9 Dec 2015 15:17:43 +0100 Subject: ALSA: hda - Fix noise problems on Thinkpad T440s Lenovo Thinkpad T440s suffers from constant background noises, and it seems to be a generic hardware issue on this model: https://forums.lenovo.com/t5/ThinkPad-T400-T500-and-newer-T/T440s-speaker-noise/td-p/1339883 As the noise comes from the analog loopback path, disabling the path is the easy workaround. Also, the machine gives significant cracking noises at PM suspend. A workaround found by trial-and-error is to disable the shutup callback currently used for ALC269-variant. This patch addresses these noise issues by introducing a new fixup chain. Although the same workaround might be applicable to other Thinkpad models, it's applied only to T440s (17aa:220c) in this patch, so far, just to be safe (you chicken!). As a compromise, a new model option string "tp440" is provided now, though, so that owners of other Thinkpad models can test it more easily. Bugzilla: https://bugzilla.opensuse.org/show_bug.cgi?id=958504 Reported-and-tested-by: Tim Hardeck Cc: Signed-off-by: Takashi Iwai diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 49b1d36..8dd2ac1 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -4204,6 +4204,18 @@ static void alc_fixup_tpt440_dock(struct hda_codec *codec, } } +/* additional fixup for Thinkpad T440s noise problem */ +static void alc_fixup_tpt440(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + struct alc_spec *spec = codec->spec; + + if (action == HDA_FIXUP_ACT_PRE_PROBE) { + spec->shutup = alc_no_shutup; /* reduce click noise */ + spec->gen.mixer_nid = 0; /* reduce background noise */ + } +} + static void alc_shutup_dell_xps13(struct hda_codec *codec) { struct alc_spec *spec = codec->spec; @@ -4578,6 +4590,7 @@ enum { ALC255_FIXUP_HEADSET_MODE_NO_HP_MIC, ALC293_FIXUP_DELL1_MIC_NO_PRESENCE, ALC292_FIXUP_TPT440_DOCK, + ALC292_FIXUP_TPT440, ALC283_FIXUP_BXBT2807_MIC, ALC255_FIXUP_DELL_WMI_MIC_MUTE_LED, ALC282_FIXUP_ASPIRE_V5_PINS, @@ -5051,6 +5064,12 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_LIMIT_INT_MIC_BOOST }, + [ALC292_FIXUP_TPT440] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc_fixup_tpt440, + .chained = true, + .chain_id = ALC292_FIXUP_TPT440_DOCK, + }, [ALC283_FIXUP_BXBT2807_MIC] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -5332,7 +5351,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x21fb, "Thinkpad T430s", ALC269_FIXUP_LENOVO_DOCK), SND_PCI_QUIRK(0x17aa, 0x2203, "Thinkpad X230 Tablet", ALC269_FIXUP_LENOVO_DOCK), SND_PCI_QUIRK(0x17aa, 0x2208, "Thinkpad T431s", ALC269_FIXUP_LENOVO_DOCK), - SND_PCI_QUIRK(0x17aa, 0x220c, "Thinkpad T440s", ALC292_FIXUP_TPT440_DOCK), + SND_PCI_QUIRK(0x17aa, 0x220c, "Thinkpad T440s", ALC292_FIXUP_TPT440), SND_PCI_QUIRK(0x17aa, 0x220e, "Thinkpad T440p", ALC292_FIXUP_TPT440_DOCK), SND_PCI_QUIRK(0x17aa, 0x2210, "Thinkpad T540p", ALC292_FIXUP_TPT440_DOCK), SND_PCI_QUIRK(0x17aa, 0x2211, "Thinkpad W541", ALC292_FIXUP_TPT440_DOCK), @@ -5432,6 +5451,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC283_FIXUP_CHROME_BOOK, .name = "alc283-dac-wcaps"}, {.id = ALC283_FIXUP_SENSE_COMBO_JACK, .name = "alc283-sense-combo"}, {.id = ALC292_FIXUP_TPT440_DOCK, .name = "tpt440-dock"}, + {.id = ALC292_FIXUP_TPT440, .name = "tpt440"}, {} }; -- cgit v0.10.2 From 4c3141e09cfa6460bfcd5e90f73e498db654c917 Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Tue, 1 Dec 2015 17:24:17 +0100 Subject: of/irq: Export of_irq_find_parent again of_irq_find_parent was made static since it had no users outside of of_irq.c. Export it again since we are going to use it again. Signed-off-by: Carlo Caione [robh: move of_irq_find_parent to correct ifdef section] Signed-off-by: Rob Herring diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 902b89b..4fa916d 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -53,7 +53,7 @@ EXPORT_SYMBOL_GPL(irq_of_parse_and_map); * Returns a pointer to the interrupt parent node, or NULL if the interrupt * parent could not be determined. */ -static struct device_node *of_irq_find_parent(struct device_node *child) +struct device_node *of_irq_find_parent(struct device_node *child) { struct device_node *p; const __be32 *parp; @@ -77,6 +77,7 @@ static struct device_node *of_irq_find_parent(struct device_node *child) return p; } +EXPORT_SYMBOL_GPL(of_irq_find_parent); /** * of_irq_parse_raw - Low level interrupt tree parsing diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 039f2ee..f648acf 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -46,6 +46,7 @@ extern int of_irq_get(struct device_node *dev, int index); extern int of_irq_get_byname(struct device_node *dev, const char *name); extern int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs); +extern struct device_node *of_irq_find_parent(struct device_node *child); extern struct irq_domain *of_msi_get_domain(struct device *dev, struct device_node *np, enum irq_domain_bus_token token); @@ -70,6 +71,11 @@ static inline int of_irq_to_resource_table(struct device_node *dev, { return 0; } +static inline void *of_irq_find_parent(struct device_node *child) +{ + return NULL; +} + static inline struct irq_domain *of_msi_get_domain(struct device *dev, struct device_node *np, enum irq_domain_bus_token token) -- cgit v0.10.2 From eaddb5725357e9f05ffe5d271630f8197d089da4 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 9 Dec 2015 09:11:10 -0600 Subject: of/irq: move of_msi_map_rid declaration to the correct ifdef section In checking fixes for of_irq_find_parent declaration location, I found that of_msi_map_rid is also wrong. of_msi_map_rid is not implemented for Sparc, so it should not be in the Sparc specific section of the header. Move it to just depend on OF_IRQ. Cc: Frank Rowand Signed-off-by: Rob Herring diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index f648acf..1e0deb8 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -53,6 +53,7 @@ extern struct irq_domain *of_msi_get_domain(struct device *dev, extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 rid); extern void of_msi_configure(struct device *dev, struct device_node *np); +u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); #else static inline int of_irq_count(struct device_node *dev) { @@ -90,6 +91,11 @@ static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev static inline void of_msi_configure(struct device *dev, struct device_node *np) { } +static inline u32 of_msi_map_rid(struct device *dev, + struct device_node *msi_np, u32 rid_in) +{ + return rid_in; +} #endif #if defined(CONFIG_OF_IRQ) || defined(CONFIG_SPARC) @@ -99,7 +105,6 @@ static inline void of_msi_configure(struct device *dev, struct device_node *np) * so declare it here regardless of the CONFIG_OF_IRQ setting. */ extern unsigned int irq_of_parse_and_map(struct device_node *node, int index); -u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); #else /* !CONFIG_OF && !CONFIG_SPARC */ static inline unsigned int irq_of_parse_and_map(struct device_node *dev, @@ -107,12 +112,6 @@ static inline unsigned int irq_of_parse_and_map(struct device_node *dev, { return 0; } - -static inline u32 of_msi_map_rid(struct device *dev, - struct device_node *msi_np, u32 rid_in) -{ - return rid_in; -} #endif /* !CONFIG_OF */ #endif /* __OF_IRQ_H */ -- cgit v0.10.2 From 7d32cdef535622c0aea39807989f62cdddea207e Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Mon, 23 Nov 2015 21:50:11 +0200 Subject: usb: musb: fail with error when no DMA controller set Fail with error when no DMA controller is set. Signed-off-by: Aaro Koskinen Signed-off-by: Felipe Balbi diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c index 3c07ffd..ee9ff70 100644 --- a/drivers/usb/musb/musb_core.c +++ b/drivers/usb/musb/musb_core.c @@ -2094,6 +2094,7 @@ musb_init_controller(struct device *dev, int nIrq, void __iomem *ctrl) #ifndef CONFIG_MUSB_PIO_ONLY if (!musb->ops->dma_init || !musb->ops->dma_exit) { dev_err(dev, "DMA controller not set\n"); + status = -ENODEV; goto fail2; } musb_dma_controller_create = musb->ops->dma_init; -- cgit v0.10.2 From 5328e1ea87fb2b5cf695115df4325c1913209e97 Mon Sep 17 00:00:00 2001 From: Gabriele Martino Date: Wed, 9 Dec 2015 17:05:58 +0100 Subject: ALSA: hda/ca0132 - quirk for Alienware 17 2015 The Alienware 17 (2015) has the same card and pin configuration of the Alienware 15, so the same quirks must be applied. Signed-off-by: Gabriele Martino Cc: Signed-off-by: Takashi Iwai diff --git a/sound/pci/hda/patch_ca0132.c b/sound/pci/hda/patch_ca0132.c index f8a12ca4..4ef2259 100644 --- a/sound/pci/hda/patch_ca0132.c +++ b/sound/pci/hda/patch_ca0132.c @@ -778,7 +778,8 @@ static const struct hda_pintbl alienware_pincfgs[] = { }; static const struct snd_pci_quirk ca0132_quirks[] = { - SND_PCI_QUIRK(0x1028, 0x0685, "Alienware 15", QUIRK_ALIENWARE), + SND_PCI_QUIRK(0x1028, 0x0685, "Alienware 15 2015", QUIRK_ALIENWARE), + SND_PCI_QUIRK(0x1028, 0x0688, "Alienware 17 2015", QUIRK_ALIENWARE), {} }; -- cgit v0.10.2 From 49e99fc717f624aa75ca755d6e7bc029efd3f0e9 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 9 Dec 2015 16:23:24 +0000 Subject: dm thin metadata: fix bug when taking a metadata snapshot When you take a metadata snapshot the btree roots for the mapping and details tree need to have their reference counts incremented so they persist for the lifetime of the metadata snap. The roots being incremented were those currently written in the superblock, which could possibly be out of date if concurrent IO is triggering new mappings, breaking of sharing, etc. Fix this by performing a commit with the metadata lock held while taking a metadata snapshot. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 67871e7..c219a05 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1207,6 +1207,12 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) dm_block_t held_root; /* + * We commit to ensure the btree roots which we increment in a + * moment are up to date. + */ + __commit_transaction(pmd); + + /* * Copy the superblock. */ dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION); -- cgit v0.10.2 From 50dd842ad83b43bed71790efb31cfb2f6c05c9c1 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 9 Dec 2015 16:38:12 +0000 Subject: dm space map metadata: fix ref counting bug when bootstrapping a new space map When applying block operations (BOPs) do not remove them from the uncommitted BOP ring-buffer until after they've been applied -- in case we recurse. Also, perform BOP_INC operation, in dm_sm_metadata_create() and sm_metadata_extend(), in terms of the uncommitted BOP ring-buffer rather than using direct calls to sm_ll_inc(). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 5309129..fca6dbc 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -136,7 +136,7 @@ static int brb_push(struct bop_ring_buffer *brb, return 0; } -static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result) +static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result) { struct block_op *bop; @@ -147,6 +147,17 @@ static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result) result->type = bop->type; result->block = bop->block; + return 0; +} + +static int brb_pop(struct bop_ring_buffer *brb) +{ + struct block_op *bop; + + if (brb_empty(brb)) + return -ENODATA; + + bop = brb->bops + brb->begin; brb->begin = brb_next(brb, brb->begin); return 0; @@ -211,7 +222,7 @@ static int apply_bops(struct sm_metadata *smm) while (!brb_empty(&smm->uncommitted)) { struct block_op bop; - r = brb_pop(&smm->uncommitted, &bop); + r = brb_peek(&smm->uncommitted, &bop); if (r) { DMERR("bug in bop ring buffer"); break; @@ -220,6 +231,8 @@ static int apply_bops(struct sm_metadata *smm) r = commit_bop(smm, &bop); if (r) break; + + brb_pop(&smm->uncommitted); } return r; @@ -683,7 +696,6 @@ static struct dm_space_map bootstrap_ops = { static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) { int r, i; - enum allocation_event ev; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); dm_block_t old_len = smm->ll.nr_blocks; @@ -705,11 +717,12 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) * allocate any new blocks. */ do { - for (i = old_len; !r && i < smm->begin; i++) { - r = sm_ll_inc(&smm->ll, i, &ev); - if (r) - goto out; - } + for (i = old_len; !r && i < smm->begin; i++) + r = add_bop(smm, BOP_INC, i); + + if (r) + goto out; + old_len = smm->begin; r = apply_bops(smm); @@ -754,7 +767,6 @@ int dm_sm_metadata_create(struct dm_space_map *sm, { int r; dm_block_t i; - enum allocation_event ev; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); smm->begin = superblock + 1; @@ -782,7 +794,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm, * allocated blocks that they were built from. */ for (i = superblock; !r && i < smm->begin; i++) - r = sm_ll_inc(&smm->ll, i, &ev); + r = add_bop(smm, BOP_INC, i); if (r) return r; -- cgit v0.10.2 From d7e35dfa2531b53618b9e6edcd8752ce988ac555 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 3 Dec 2015 22:04:01 -0500 Subject: bitops.h: correctly handle rol32 with 0 byte shift ROL on a 32 bit integer with a shift of 32 or more is undefined and the result is arch-dependent. Avoid this by handling the trivial case of roling by 0 correctly. The trivial solution of checking if shift is 0 breaks gcc's detection of this code as a ROL instruction, which is unacceptable. This bug was reported and fixed in GCC (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157): The standard rotate idiom, (x << n) | (x >> (32 - n)) is recognized by gcc (for concreteness, I discuss only the case that x is an uint32_t here). However, this is portable C only for n in the range 0 < n < 32. For n == 0, we get x >> 32 which gives undefined behaviour according to the C standard (6.5.7, Bitwise shift operators). To portably support n == 0, one has to write the rotate as something like (x << n) | (x >> ((-n) & 31)) And this is apparently not recognized by gcc. Note that this is broken on older GCCs and will result in slower ROL. Acked-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Linus Torvalds diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 2b8ed12..defeaac 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -107,7 +107,7 @@ static inline __u64 ror64(__u64 word, unsigned int shift) */ static inline __u32 rol32(__u32 word, unsigned int shift) { - return (word << shift) | (word >> (32 - shift)); + return (word << shift) | (word >> ((-shift) & 31)); } /** -- cgit v0.10.2 From 27f972d3e00b50639deb4cc1392afaeb08d3cecc Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Tue, 8 Dec 2015 13:57:51 -0500 Subject: ipmi: move timer init to before irq is setup We encountered a panic on boot in ipmi_si on a dell per320 due to an uninitialized timer as follows. static int smi_start_processing(void *send_info, ipmi_smi_t intf) { /* Try to claim any interrupts. */ if (new_smi->irq_setup) new_smi->irq_setup(new_smi); --> IRQ arrives here and irq handler tries to modify uninitialized timer which triggers BUG_ON(!timer->function) in __mod_timer(). Call Trace: [] start_new_msg+0x47/0x80 [ipmi_si] [] start_check_enables+0x4e/0x60 [ipmi_si] [] smi_event_handler+0x1e8/0x640 [ipmi_si] [] ? __rcu_process_callbacks+0x54/0x350 [] si_irq_handler+0x3c/0x60 [ipmi_si] [] handle_IRQ_event+0x60/0x170 [] handle_edge_irq+0xde/0x180 [] handle_irq+0x49/0xa0 [] do_IRQ+0x6c/0xf0 [] ret_from_intr+0x0/0x11 /* Set up the timer that drives the interface. */ setup_timer(&new_smi->si_timer, smi_timeout, (long)new_smi); The following patch fixes the problem. To: Openipmi-developer@lists.sourceforge.net To: Corey Minyard CC: linux-kernel@vger.kernel.org Signed-off-by: Jan Stancek Signed-off-by: Tony Camuso Signed-off-by: Corey Minyard Cc: stable@vger.kernel.org # Applies cleanly to 3.10-, needs small rework before diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 55fe902..4cc72fa 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -1230,14 +1230,14 @@ static int smi_start_processing(void *send_info, new_smi->intf = intf; - /* Try to claim any interrupts. */ - if (new_smi->irq_setup) - new_smi->irq_setup(new_smi); - /* Set up the timer that drives the interface. */ setup_timer(&new_smi->si_timer, smi_timeout, (long)new_smi); smi_mod_timer(new_smi, jiffies + SI_TIMEOUT_JIFFIES); + /* Try to claim any interrupts. */ + if (new_smi->irq_setup) + new_smi->irq_setup(new_smi); + /* * Check if the user forcefully enabled the daemon. */ -- cgit v0.10.2 From 14054fb1da099fd89208b8b319a00e0b902c7645 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Mon, 30 Nov 2015 17:56:38 +0200 Subject: ARM: dts: am4372: fix clock source for arm twd and global timers ARM TWD and Global timer are clocked by PERIPHCLK which is MPU_CLK/2. But now they are clocked by dpll_mpu_m2_ck == MPU_CLK and, as result. Timekeeping core misbehaves. For example, execution of command "sleep 5" will take 10 sec instead of 5. Hence, fix it by adding mpu_periphclk ("fixed-factor-clock") and use it for clocking ARM TWD and Global timer (same way as on OMAP4). Cc: Tony Lindgren Cc: Felipe Balbi Cc: Tero Kristo Fixes:commit 8cbd4c2f6a99 ("arm: boot: dts: am4372: add ARM timers and SCU nodes") Signed-off-by: Grygorii Strashko Reviewed-by: Felipe Balbi Signed-off-by: Tony Lindgren diff --git a/arch/arm/boot/dts/am4372.dtsi b/arch/arm/boot/dts/am4372.dtsi index d83ff9c..de8791a 100644 --- a/arch/arm/boot/dts/am4372.dtsi +++ b/arch/arm/boot/dts/am4372.dtsi @@ -74,7 +74,7 @@ reg = <0x48240200 0x100>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&dpll_mpu_m2_ck>; + clocks = <&mpu_periphclk>; }; local_timer: timer@48240600 { @@ -82,7 +82,7 @@ reg = <0x48240600 0x100>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&dpll_mpu_m2_ck>; + clocks = <&mpu_periphclk>; }; l2-cache-controller@48242000 { diff --git a/arch/arm/boot/dts/am43xx-clocks.dtsi b/arch/arm/boot/dts/am43xx-clocks.dtsi index cc88728..a38af2b 100644 --- a/arch/arm/boot/dts/am43xx-clocks.dtsi +++ b/arch/arm/boot/dts/am43xx-clocks.dtsi @@ -259,6 +259,14 @@ ti,invert-autoidle-bit; }; + mpu_periphclk: mpu_periphclk { + #clock-cells = <0>; + compatible = "fixed-factor-clock"; + clocks = <&dpll_mpu_m2_ck>; + clock-mult = <1>; + clock-div = <2>; + }; + dpll_ddr_ck: dpll_ddr_ck { #clock-cells = <0>; compatible = "ti,am3-dpll-clock"; -- cgit v0.10.2 From ed8b45a3679eb49069b094c0711b30833f27c734 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 10 Dec 2015 14:37:53 +0000 Subject: dm btree: fix bufio buffer leaks in dm_btree_del() error path If dm_btree_del()'s call to push_frame() fails, e.g. due to btree_node_validator finding invalid metadata, the dm_btree_del() error path must unlock all frames (which have active dm-bufio buffers) that were pushed onto the del_stack. Otherwise, dm_bufio_client_destroy() will BUG_ON() because dm-bufio buffers have leaked, e.g.: device-mapper: bufio: leaked buffer 3, hold count 1, list 0 Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 7e5b7f1..b1ced58 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -257,6 +257,16 @@ static void pop_frame(struct del_stack *s) dm_tm_unlock(s->tm, f->b); } +static void unlock_all_frames(struct del_stack *s) +{ + struct frame *f; + + while (unprocessed_frames(s)) { + f = s->spine + s->top--; + dm_tm_unlock(s->tm, f->b); + } +} + int dm_btree_del(struct dm_btree_info *info, dm_block_t root) { int r; @@ -313,9 +323,13 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) pop_frame(s); } } - out: + if (r) { + /* cleanup all frames of del_stack */ + unlock_all_frames(s); + } kfree(s); + return r; } EXPORT_SYMBOL_GPL(dm_btree_del); -- cgit v0.10.2 From 059393c5bdd1420bdf1bed2972f33196dff263ae Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 7 Dec 2015 10:11:11 +0000 Subject: irqchip/gic-v3: Add missing struct device_node declaration When the GICv3 header file is used in a C file that doesn't include any of the OF stuff, we end up with a bunch of ugly warnings. Let's keep GCC quiet by adding a forward declaration. Signed-off-by: Marc Zyngier Cc: Cc: Jason Cooper Link: http://lkml.kernel.org/r/1449483072-17694-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index c9ae0c6..d5d798b 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -330,6 +330,7 @@ struct rdists { }; struct irq_domain; +struct device_node; int its_cpu_init(void); int its_init(struct device_node *node, struct rdists *rdists, struct irq_domain *domain); -- cgit v0.10.2 From 8e31ed9c189788463c3b6c1f950cd90755f05f76 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 7 Dec 2015 10:11:12 +0000 Subject: irqchip/gic-v3: Add missing include for barrier.h Both the 32bit and 64bit versions of the GICv3 header file are using barriers, but neglect to include barrier.h, leading to an interesting splat in some circumstances. Signed-off-by: Marc Zyngier Cc: Cc: Jason Cooper Link: http://lkml.kernel.org/r/1449483072-17694-3-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index 6607d97..7da5503 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -21,6 +21,7 @@ #ifndef __ASSEMBLY__ #include +#include #define __ACCESS_CP15(CRn, Op1, CRm, Op2) p15, Op1, %0, CRn, CRm, Op2 #define __ACCESS_CP15_64(Op1, CRm) p15, Op1, %Q0, %R0, CRm diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 030cdcb..2731d3b 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -77,6 +77,7 @@ #ifndef __ASSEMBLY__ #include +#include /* * Low-level accessors -- cgit v0.10.2 From e2bf3e6ecaff79c5479682da2dc7b2035e52c5b8 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 10 Dec 2015 18:21:41 +0100 Subject: clocksource: Mmio: remove artificial 32bit limitation The EP93xx is registering a clocksource of 40 bits with clocksource_mmio_init() but this is not working because of this artificial limitation. It works fine to lift the uppe limit to 64 bits, and since cycle_t is u64, it should intuitively have been like that from the beginning. Fixes: 000bc17817bf "ARM: ep93xx: switch to GENERIC_CLOCKEVENTS" Reported-by: Alexander Sverdlin Signed-off-by: Linus Walleij Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/1449768101-6879-1-git-send-email-linus.walleij@linaro.org Signed-off-by: Thomas Gleixner diff --git a/drivers/clocksource/mmio.c b/drivers/clocksource/mmio.c index 1593ade..c4f7d7a 100644 --- a/drivers/clocksource/mmio.c +++ b/drivers/clocksource/mmio.c @@ -55,7 +55,7 @@ int __init clocksource_mmio_init(void __iomem *base, const char *name, { struct clocksource_mmio *cs; - if (bits > 32 || bits < 16) + if (bits > 64 || bits < 16) return -EINVAL; cs = kzalloc(sizeof(struct clocksource_mmio), GFP_KERNEL); -- cgit v0.10.2 From 5d756147f7d356c3e85372373a0054dcf25a370e Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 7 Dec 2015 21:09:24 +0800 Subject: ARM: dts: berlin: correct BG2Q's sdhci2 2nd clock The optional 2nd clock is CLKID_SDIO. We removed CLK_IGNORE_UNUSED from CLKID_SDIO's flag, so the sdhci2 doesn't work. This patch fixes this issue by correcting the sdhci2's 2nd clock. Signed-off-by: Jisheng Zhang Signed-off-by: Sebastian Hesselbarth diff --git a/arch/arm/boot/dts/berlin2q.dtsi b/arch/arm/boot/dts/berlin2q.dtsi index 8ea177f..5b10129 100644 --- a/arch/arm/boot/dts/berlin2q.dtsi +++ b/arch/arm/boot/dts/berlin2q.dtsi @@ -135,7 +135,7 @@ compatible = "mrvl,pxav3-mmc"; reg = <0xab1000 0x200>; interrupts = ; - clocks = <&chip_clk CLKID_NFC_ECC>, <&chip_clk CLKID_NFC>; + clocks = <&chip_clk CLKID_NFC_ECC>, <&chip_clk CLKID_SDIO>; clock-names = "io", "core"; status = "disabled"; }; -- cgit v0.10.2 From b5010d2081331e1c95358c4eb500bd30f5d532ce Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 7 Dec 2015 21:09:25 +0800 Subject: ARM: dts: berlin: add 2nd clock for BG2Q sdhci0 and sdhci1 We removed CLK_IGNORE_UNUSED from CLKID_SDIO's flag, so the sdhci0 and sdhci1 don't work. We fix this by adding the optional 2nd clock for BG2Q's sdhci0 and sdhci1. This patch brings another benefit: the 2nd clock can be disabled during runtime pm, so saves power a bit. Signed-off-by: Jisheng Zhang Signed-off-by: Sebastian Hesselbarth diff --git a/arch/arm/boot/dts/berlin2q.dtsi b/arch/arm/boot/dts/berlin2q.dtsi index 5b10129..fb1da99 100644 --- a/arch/arm/boot/dts/berlin2q.dtsi +++ b/arch/arm/boot/dts/berlin2q.dtsi @@ -118,7 +118,8 @@ sdhci0: sdhci@ab0000 { compatible = "mrvl,pxav3-mmc"; reg = <0xab0000 0x200>; - clocks = <&chip_clk CLKID_SDIO1XIN>; + clocks = <&chip_clk CLKID_SDIO1XIN>, <&chip_clk CLKID_SDIO>; + clock-names = "io", "core"; interrupts = ; status = "disabled"; }; @@ -126,7 +127,8 @@ sdhci1: sdhci@ab0800 { compatible = "mrvl,pxav3-mmc"; reg = <0xab0800 0x200>; - clocks = <&chip_clk CLKID_SDIO1XIN>; + clocks = <&chip_clk CLKID_SDIO1XIN>, <&chip_clk CLKID_SDIO>; + clock-names = "io", "core"; interrupts = ; status = "disabled"; }; -- cgit v0.10.2 From ddd47fbcf68dd07487fd86f5e3d9f594f8349980 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 9 Dec 2015 15:11:12 +0100 Subject: ARM: pxa: use PWM lookup table for all machines The recent change to use a pwm lookup table for the ezx machines was incomplete and only changed the a780 model, but not the other ones in the same file. This adds the missing calls to pwm_add_table(). Signed-off-by: Arnd Bergmann Fixes: c3322022897c ("ARM: pxa: ezx: Use PWM lookup table") Acked-by: Thierry Reding Acked-by: Robert Jarzmik diff --git a/arch/arm/mach-pxa/ezx.c b/arch/arm/mach-pxa/ezx.c index 9a9c15b..7c0d561 100644 --- a/arch/arm/mach-pxa/ezx.c +++ b/arch/arm/mach-pxa/ezx.c @@ -889,6 +889,7 @@ static void __init e680_init(void) pxa_set_keypad_info(&e680_keypad_platform_data); + pwm_add_table(ezx_pwm_lookup, ARRAY_SIZE(ezx_pwm_lookup)); platform_add_devices(ARRAY_AND_SIZE(ezx_devices)); platform_add_devices(ARRAY_AND_SIZE(e680_devices)); } @@ -956,6 +957,7 @@ static void __init a1200_init(void) pxa_set_keypad_info(&a1200_keypad_platform_data); + pwm_add_table(ezx_pwm_lookup, ARRAY_SIZE(ezx_pwm_lookup)); platform_add_devices(ARRAY_AND_SIZE(ezx_devices)); platform_add_devices(ARRAY_AND_SIZE(a1200_devices)); } @@ -1148,6 +1150,7 @@ static void __init a910_init(void) platform_device_register(&a910_camera); } + pwm_add_table(ezx_pwm_lookup, ARRAY_SIZE(ezx_pwm_lookup)); platform_add_devices(ARRAY_AND_SIZE(ezx_devices)); platform_add_devices(ARRAY_AND_SIZE(a910_devices)); } @@ -1215,6 +1218,7 @@ static void __init e6_init(void) pxa_set_keypad_info(&e6_keypad_platform_data); + pwm_add_table(ezx_pwm_lookup, ARRAY_SIZE(ezx_pwm_lookup)); platform_add_devices(ARRAY_AND_SIZE(ezx_devices)); platform_add_devices(ARRAY_AND_SIZE(e6_devices)); } @@ -1256,6 +1260,7 @@ static void __init e2_init(void) pxa_set_keypad_info(&e2_keypad_platform_data); + pwm_add_table(ezx_pwm_lookup, ARRAY_SIZE(ezx_pwm_lookup)); platform_add_devices(ARRAY_AND_SIZE(ezx_devices)); platform_add_devices(ARRAY_AND_SIZE(e2_devices)); } -- cgit v0.10.2 From 9f5bd30818c42c6c36a51f93b4df75a2ea2bd85e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 30 Nov 2015 04:17:31 +0200 Subject: vgaarb: fix signal handling in vga_get() There are few defects in vga_get() related to signal hadning: - we shouldn't check for pending signals for TASK_UNINTERRUPTIBLE case; - if we found pending signal we must remove ourself from wait queue and change task state back to running; - -ERESTARTSYS is more appropriate, I guess. Signed-off-by: Kirill A. Shutemov Cc: stable@vger.kernel.org Reviewed-by: David Herrmann Signed-off-by: Dave Airlie diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c index 3166e4b..9abcaa5 100644 --- a/drivers/gpu/vga/vgaarb.c +++ b/drivers/gpu/vga/vgaarb.c @@ -395,8 +395,10 @@ int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible) set_current_state(interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); - if (signal_pending(current)) { - rc = -EINTR; + if (interruptible && signal_pending(current)) { + __set_current_state(TASK_RUNNING); + remove_wait_queue(&vga_wait_queue, &wait); + rc = -ERESTARTSYS; break; } schedule(); -- cgit v0.10.2 From 9c17190595840b4ed30e8d5f286636ceb28aae4f Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 30 Nov 2015 17:59:26 -0800 Subject: ARM: dts: vf610: use reset values for L2 cache latencies Linux on Vybrid used several different L2 latencies so far, none of them seem to be the right ones. According to the application note AN4947 ("Understanding Vybrid Architecture"), the tag portion runs on CPU clock and is inside the L2 cache controller, whereas the data portion is stored in the external SRAM running on platform clock. Hence it is likely that the correct value requires a higher data latency then tag latency. These are the values which have been used so far: - The mainline values: arm,data-latency = <1 1 1>; arm,tag-latency = <2 2 2>; Those values have lead to problems on higher clocks. They look like a poor translation from the reset values (missing +1 offset and a mix up between tag/latency values). - The Linux 3.0 (SoC vendor BSP) values (converted to DT notation): arm,data-latency = <4 2 3> arm,tag-latency = <4 2 3> The cache initialization function along with the value matches the i.MX6 code from the same kernel, so it seems that those values have just been copied. - The Colibri values: arm,data-latency = <2 1 2>; arm,tag-latency = <3 2 3>; Those were a mix between the values of the Linux 3.0 based BSP and the mainline values above. - The SoC Reset values (converted to DT notation): arm,data-latency = <3 3 3>; arm,tag-latency = <2 2 2>; So far there is no official statement on what the correct values are. See also the related Freescale community thread: https://community.freescale.com/message/579785#579785 For now, the reset values seem to be the best bet. Remove all other "bogus" values and use the reset value on vf610.dtsi level. Signed-off-by: Stefan Agner Cc: Signed-off-by: Shawn Guo diff --git a/arch/arm/boot/dts/vf610-colibri.dtsi b/arch/arm/boot/dts/vf610-colibri.dtsi index 19fe045..2d7eab7 100644 --- a/arch/arm/boot/dts/vf610-colibri.dtsi +++ b/arch/arm/boot/dts/vf610-colibri.dtsi @@ -18,8 +18,3 @@ reg = <0x80000000 0x10000000>; }; }; - -&L2 { - arm,data-latency = <2 1 2>; - arm,tag-latency = <3 2 3>; -}; diff --git a/arch/arm/boot/dts/vf610.dtsi b/arch/arm/boot/dts/vf610.dtsi index 5f8eb1b..58bc6e4 100644 --- a/arch/arm/boot/dts/vf610.dtsi +++ b/arch/arm/boot/dts/vf610.dtsi @@ -19,7 +19,7 @@ reg = <0x40006000 0x1000>; cache-unified; cache-level = <2>; - arm,data-latency = <1 1 1>; + arm,data-latency = <3 3 3>; arm,tag-latency = <2 2 2>; }; }; -- cgit v0.10.2 From 82d340081b6f71237373d1452e3573a5a122794c Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 8 Dec 2015 17:39:15 +0000 Subject: arm64: Improve error reporting on set_pte_at() checks Currently the BUG_ON() checks do not give enough information about the PTEs being set. This patch changes BUG_ON to WARN_ONCE and dumps the values of the old and new PTEs. In addition, the checks are only made if the new PTE entry is valid. Signed-off-by: Catalin Marinas Reported-by: Ming Lei Cc: Will Deacon diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7e074f9..63f52b5 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -276,10 +276,14 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, * hardware updates of the pte (ptep_set_access_flags safely changes * valid ptes without going through an invalid entry). */ - if (IS_ENABLED(CONFIG_DEBUG_VM) && IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && - pte_valid(*ptep)) { - BUG_ON(!pte_young(pte)); - BUG_ON(pte_write(*ptep) && !pte_dirty(pte)); + if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && + pte_valid(*ptep) && pte_valid(pte)) { + VM_WARN_ONCE(!pte_young(pte), + "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", + __func__, pte_val(*ptep), pte_val(pte)); + VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(pte), + "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx", + __func__, pte_val(*ptep), pte_val(pte)); } set_pte(ptep, pte); -- cgit v0.10.2 From f69115fdbc1ac0718e7d19ad3caa3da2ecfe1c96 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Fri, 11 Dec 2015 14:38:06 +0200 Subject: xhci: fix usb2 resume timing and races. According to USB 2 specs ports need to signal resume for at least 20ms, in practice even longer, before moving to U0 state. Both host and devices can initiate resume. On device initiated resume, a port status interrupt with the port in resume state in issued. The interrupt handler tags a resume_done[port] timestamp with current time + USB_RESUME_TIMEOUT, and kick roothub timer. Root hub timer requests for port status, finds the port in resume state, checks if resume_done[port] timestamp passed, and set port to U0 state. On host initiated resume, current code sets the port to resume state, sleep 20ms, and finally sets the port to U0 state. This should also be changed to work in a similar way as the device initiated resume, with timestamp tagging, but that is not yet tested and will be a separate fix later. There are a few issues with this approach 1. A host initiated resume will also generate a resume event. The event handler will find the port in resume state, believe it's a device initiated resume, and act accordingly. 2. A port status request might cut the resume signalling short if a get_port_status request is handled during the host resume signalling. The port will be found in resume state. The timestamp is not set leading to time_after_eq(jiffies, timestamp) returning true, as timestamp = 0. get_port_status will proceed with moving the port to U0. 3. If an error, or anything else happens to the port during device initiated resume signalling it will leave all the device resume parameters hanging uncleared, preventing further suspend, returning -EBUSY, and cause the pm thread to busyloop trying to enter suspend. Fix this by using the existing resuming_ports bitfield to indicate that resume signalling timing is taken care of. Check if the resume_done[port] is set before using it for timestamp comparison, and also clear out any resume signalling related variables if port is not in U0 or Resume state This issue was discovered when a PM thread busylooped, trying to runtime suspend the xhci USB 2 roothub on a Dell XPS Cc: stable Reported-by: Daniel J Blueman Tested-by: Daniel J Blueman Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 0230965..f980c23 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -733,8 +733,30 @@ static u32 xhci_get_port_status(struct usb_hcd *hcd, if ((raw_port_status & PORT_RESET) || !(raw_port_status & PORT_PE)) return 0xffffffff; - if (time_after_eq(jiffies, - bus_state->resume_done[wIndex])) { + /* did port event handler already start resume timing? */ + if (!bus_state->resume_done[wIndex]) { + /* If not, maybe we are in a host initated resume? */ + if (test_bit(wIndex, &bus_state->resuming_ports)) { + /* Host initated resume doesn't time the resume + * signalling using resume_done[]. + * It manually sets RESUME state, sleeps 20ms + * and sets U0 state. This should probably be + * changed, but not right now. + */ + } else { + /* port resume was discovered now and here, + * start resume timing + */ + unsigned long timeout = jiffies + + msecs_to_jiffies(USB_RESUME_TIMEOUT); + + set_bit(wIndex, &bus_state->resuming_ports); + bus_state->resume_done[wIndex] = timeout; + mod_timer(&hcd->rh_timer, timeout); + } + /* Has resume been signalled for USB_RESUME_TIME yet? */ + } else if (time_after_eq(jiffies, + bus_state->resume_done[wIndex])) { int time_left; xhci_dbg(xhci, "Resume USB2 port %d\n", @@ -775,13 +797,26 @@ static u32 xhci_get_port_status(struct usb_hcd *hcd, } else { /* * The resume has been signaling for less than - * 20ms. Report the port status as SUSPEND, - * let the usbcore check port status again - * and clear resume signaling later. + * USB_RESUME_TIME. Report the port status as SUSPEND, + * let the usbcore check port status again and clear + * resume signaling later. */ status |= USB_PORT_STAT_SUSPEND; } } + /* + * Clear stale usb2 resume signalling variables in case port changed + * state during resume signalling. For example on error + */ + if ((bus_state->resume_done[wIndex] || + test_bit(wIndex, &bus_state->resuming_ports)) && + (raw_port_status & PORT_PLS_MASK) != XDEV_U3 && + (raw_port_status & PORT_PLS_MASK) != XDEV_RESUME) { + bus_state->resume_done[wIndex] = 0; + clear_bit(wIndex, &bus_state->resuming_ports); + } + + if ((raw_port_status & PORT_PLS_MASK) == XDEV_U0 && (raw_port_status & PORT_POWER)) { if (bus_state->suspended_ports & (1 << wIndex)) { @@ -1115,6 +1150,7 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, if ((temp & PORT_PE) == 0) goto error; + set_bit(wIndex, &bus_state->resuming_ports); xhci_set_link_state(xhci, port_array, wIndex, XDEV_RESUME); spin_unlock_irqrestore(&xhci->lock, flags); @@ -1122,6 +1158,7 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, spin_lock_irqsave(&xhci->lock, flags); xhci_set_link_state(xhci, port_array, wIndex, XDEV_U0); + clear_bit(wIndex, &bus_state->resuming_ports); } bus_state->port_c_suspend |= 1 << wIndex; diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 6c5e813..eeaa6c6 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -1583,7 +1583,8 @@ static void handle_port_status(struct xhci_hcd *xhci, */ bogus_port_status = true; goto cleanup; - } else { + } else if (!test_bit(faked_port_index, + &bus_state->resuming_ports)) { xhci_dbg(xhci, "resume HS port %d\n", port_id); bus_state->resume_done[faked_port_index] = jiffies + msecs_to_jiffies(USB_RESUME_TIMEOUT); -- cgit v0.10.2 From ad87e03213b552a5c33d5e1e7a19a73768397010 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 10 Dec 2015 15:27:21 -0500 Subject: USB: add quirk for devices with broken LPM Some USB device / host controller combinations seem to have problems with Link Power Management. For example, Steinar found that his xHCI controller wouldn't handle bandwidth calculations correctly for two video cards simultaneously when LPM was enabled, even though the bus had plenty of bandwidth available. This patch introduces a new quirk flag for devices that should remain disabled for LPM, and creates quirk entries for Steinar's devices. Signed-off-by: Alan Stern Reported-by: Steinar H. Gunderson Cc: stable Signed-off-by: Greg Kroah-Hartman diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 585c3cb..a5cc032 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -124,6 +124,10 @@ struct usb_hub *usb_hub_to_struct_hub(struct usb_device *hdev) int usb_device_supports_lpm(struct usb_device *udev) { + /* Some devices have trouble with LPM */ + if (udev->quirks & USB_QUIRK_NO_LPM) + return 0; + /* USB 2.1 (and greater) devices indicate LPM support through * their USB 2.0 Extended Capabilities BOS descriptor. */ @@ -4512,6 +4516,8 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, goto fail; } + usb_detect_quirks(udev); + if (udev->wusb == 0 && le16_to_cpu(udev->descriptor.bcdUSB) >= 0x0201) { retval = usb_get_bos_descriptor(udev); if (!retval) { @@ -4710,7 +4716,6 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, if (status < 0) goto loop; - usb_detect_quirks(udev); if (udev->quirks & USB_QUIRK_DELAY_INIT) msleep(1000); diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index fcd6ac0..6dc810b 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -202,6 +202,12 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x1a0a, 0x0200), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, + /* Blackmagic Design Intensity Shuttle */ + { USB_DEVICE(0x1edb, 0xbd3b), .driver_info = USB_QUIRK_NO_LPM }, + + /* Blackmagic Design UltraStudio SDI */ + { USB_DEVICE(0x1edb, 0xbd4f), .driver_info = USB_QUIRK_NO_LPM }, + { } /* terminating entry must be last */ }; diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index 9948c87..1d0043d 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -47,4 +47,7 @@ /* device generates spurious wakeup, ignore remote wakeup capability */ #define USB_QUIRK_IGNORE_REMOTE_WAKEUP BIT(9) +/* device can't handle Link Power Management */ +#define USB_QUIRK_NO_LPM BIT(10) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v0.10.2 From 2dd2e4d1eaadfc39eae0399c5be1a122a73ed13b Mon Sep 17 00:00:00 2001 From: yangbo lu Date: Fri, 4 Dec 2015 16:55:03 -0600 Subject: ARM64: dts: ls2080a: fix eSDHC endianness Add the "little-endian" property to fix the issue that eSDHC is not working and dumping out "mmc0: Controller never released inhibit bit(s)." error messages constantly. Fixes: 5461597f6ce0 ("dts/ls2080a: Update DTSI to add support of various peripherals") Signed-off-by: Yangbo Lu Signed-off-by: Li Yang Signed-off-by: Kevin Hilman diff --git a/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi index e81cd48..ed45e25 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi @@ -269,6 +269,7 @@ clock-frequency = <0>; /* Updated by bootloader */ voltage-ranges = <1800 1800 3300 3300>; sdhci,auto-cmd12; + little-endian; bus-width = <4>; }; -- cgit v0.10.2 From edb70e29f3c3e3b1064a7b7a5d129c8f32762e6f Mon Sep 17 00:00:00 2001 From: Li Yang Date: Fri, 4 Dec 2015 16:55:04 -0600 Subject: dt-bindings: define little-endian property for QorIQ GPIO The GPIO block on different QorIQ chips could have registers in different endianess. Define the property to specify which endian is used by the hardware. Signed-off-by: Liu Gang Acked-by: Rob Herring Signed-off-by: Li Yang Signed-off-by: Kevin Hilman diff --git a/Documentation/devicetree/bindings/gpio/gpio-mpc8xxx.txt b/Documentation/devicetree/bindings/gpio/gpio-mpc8xxx.txt index f2455c5..120bc49 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-mpc8xxx.txt +++ b/Documentation/devicetree/bindings/gpio/gpio-mpc8xxx.txt @@ -11,6 +11,10 @@ Required properties: 0 = active high 1 = active low +Optional properties: +- little-endian : GPIO registers are used as little endian. If not + present registers are used as big endian by default. + Example: gpio0: gpio@1100 { -- cgit v0.10.2 From 6534778371b157e375a28a70290201d9aa20ab08 Mon Sep 17 00:00:00 2001 From: Liu Gang Date: Fri, 4 Dec 2015 16:55:05 -0600 Subject: ls2080a/dts: Add little endian property for GPIO IP block The GPIO block for ls2080a platform has little endian registers, the GPIO driver needs this property to read/write registers by right interface. Signed-off-by: Liu Gang Signed-off-by: Li Yang Signed-off-by: Kevin Hilman diff --git a/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi index ed45e25..925552e 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi @@ -278,6 +278,7 @@ reg = <0x0 0x2300000 0x0 0x10000>; interrupts = <0 36 0x4>; /* Level high type */ gpio-controller; + little-endian; #gpio-cells = <2>; interrupt-controller; #interrupt-cells = <2>; @@ -288,6 +289,7 @@ reg = <0x0 0x2310000 0x0 0x10000>; interrupts = <0 36 0x4>; /* Level high type */ gpio-controller; + little-endian; #gpio-cells = <2>; interrupt-controller; #interrupt-cells = <2>; @@ -298,6 +300,7 @@ reg = <0x0 0x2320000 0x0 0x10000>; interrupts = <0 37 0x4>; /* Level high type */ gpio-controller; + little-endian; #gpio-cells = <2>; interrupt-controller; #interrupt-cells = <2>; @@ -308,6 +311,7 @@ reg = <0x0 0x2330000 0x0 0x10000>; interrupts = <0 37 0x4>; /* Level high type */ gpio-controller; + little-endian; #gpio-cells = <2>; interrupt-controller; #interrupt-cells = <2>; -- cgit v0.10.2 From e46e31a3696ae2d66f32c207df3969613726e636 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 30 Nov 2015 14:47:46 -0500 Subject: parisc iommu: fix panic due to trying to allocate too large region When using the Promise TX2+ SATA controller on PA-RISC, the system often crashes with kernel panic, for example just writing data with the dd utility will make it crash. Kernel panic - not syncing: drivers/parisc/sba_iommu.c: I/O MMU @ 000000000000a000 is out of mapping resources CPU: 0 PID: 18442 Comm: mkspadfs Not tainted 4.4.0-rc2 #2 Backtrace: [<000000004021497c>] show_stack+0x14/0x20 [<0000000040410bf0>] dump_stack+0x88/0x100 [<000000004023978c>] panic+0x124/0x360 [<0000000040452c18>] sba_alloc_range+0x698/0x6a0 [<0000000040453150>] sba_map_sg+0x260/0x5b8 [<000000000c18dbb4>] ata_qc_issue+0x264/0x4a8 [libata] [<000000000c19535c>] ata_scsi_translate+0xe4/0x220 [libata] [<000000000c19a93c>] ata_scsi_queuecmd+0xbc/0x320 [libata] [<0000000040499bbc>] scsi_dispatch_cmd+0xfc/0x130 [<000000004049da34>] scsi_request_fn+0x6e4/0x970 [<00000000403e95a8>] __blk_run_queue+0x40/0x60 [<00000000403e9d8c>] blk_run_queue+0x3c/0x68 [<000000004049a534>] scsi_run_queue+0x2a4/0x360 [<000000004049be68>] scsi_end_request+0x1a8/0x238 [<000000004049de84>] scsi_io_completion+0xfc/0x688 [<0000000040493c74>] scsi_finish_command+0x17c/0x1d0 The cause of the crash is not exhaustion of the IOMMU space, there is plenty of free pages. The function sba_alloc_range is called with size 0x11000, thus the pages_needed variable is 0x11. The function sba_search_bitmap is called with bits_wanted 0x11 and boundary size is 0x10 (because dma_get_seg_boundary(dev) returns 0xffff). The function sba_search_bitmap attempts to allocate 17 pages that must not cross 16-page boundary - it can't satisfy this requirement (iommu_is_span_boundary always returns true) and fails even if there are many free entries in the IOMMU space. How did it happen that we try to allocate 17 pages that don't cross 16-page boundary? The cause is in the function iommu_coalesce_chunks. This function tries to coalesce adjacent entries in the scatterlist. The function does several checks if it may coalesce one entry with the next, one of those checks is this: if (startsg->length + dma_len > max_seg_size) break; When it finishes coalescing adjacent entries, it allocates the mapping: sg_dma_len(contig_sg) = dma_len; dma_len = ALIGN(dma_len + dma_offset, IOVP_SIZE); sg_dma_address(contig_sg) = PIDE_FLAG | (iommu_alloc_range(ioc, dev, dma_len) << IOVP_SHIFT) | dma_offset; It is possible that (startsg->length + dma_len > max_seg_size) is false (we are just near the 0x10000 max_seg_size boundary), so the funcion decides to coalesce this entry with the next entry. When the coalescing succeeds, the function performs dma_len = ALIGN(dma_len + dma_offset, IOVP_SIZE); And now, because of non-zero dma_offset, dma_len is greater than 0x10000. iommu_alloc_range (a pointer to sba_alloc_range) is called and it attempts to allocate 17 pages for a device that must not cross 16-page boundary. To fix the bug, we must make sure that dma_len after addition of dma_offset and alignment doesn't cross the segment boundary. I.e. change if (startsg->length + dma_len > max_seg_size) break; to if (ALIGN(dma_len + dma_offset + startsg->length, IOVP_SIZE) > max_seg_size) break; This patch makes this change (it precalculates max_seg_boundary at the beginning of the function iommu_coalesce_chunks). I also added a check that the mapping length doesn't exceed dma_get_seg_boundary(dev) (it is not needed for Promise TX2+ SATA, but it may be needed for other devices that have dma_get_seg_boundary lower than dma_get_max_seg_size). Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Helge Deller diff --git a/drivers/parisc/iommu-helpers.h b/drivers/parisc/iommu-helpers.h index 761e77b..e56f156 100644 --- a/drivers/parisc/iommu-helpers.h +++ b/drivers/parisc/iommu-helpers.h @@ -104,7 +104,11 @@ iommu_coalesce_chunks(struct ioc *ioc, struct device *dev, struct scatterlist *contig_sg; /* contig chunk head */ unsigned long dma_offset, dma_len; /* start/len of DMA stream */ unsigned int n_mappings = 0; - unsigned int max_seg_size = dma_get_max_seg_size(dev); + unsigned int max_seg_size = min(dma_get_max_seg_size(dev), + (unsigned)DMA_CHUNK_SIZE); + unsigned int max_seg_boundary = dma_get_seg_boundary(dev) + 1; + if (max_seg_boundary) /* check if the addition above didn't overflow */ + max_seg_size = min(max_seg_size, max_seg_boundary); while (nents > 0) { @@ -138,14 +142,11 @@ iommu_coalesce_chunks(struct ioc *ioc, struct device *dev, /* ** First make sure current dma stream won't - ** exceed DMA_CHUNK_SIZE if we coalesce the + ** exceed max_seg_size if we coalesce the ** next entry. */ - if(unlikely(ALIGN(dma_len + dma_offset + startsg->length, - IOVP_SIZE) > DMA_CHUNK_SIZE)) - break; - - if (startsg->length + dma_len > max_seg_size) + if (unlikely(ALIGN(dma_len + dma_offset + startsg->length, IOVP_SIZE) > + max_seg_size)) break; /* -- cgit v0.10.2 From 5f0e9b4c301742c860b8aedc3e7c471d4a611043 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 1 Dec 2015 10:41:47 -0600 Subject: parisc: Remove unused pcibios_init_bus() There are no callers of pcibios_init_bus(), so remove it. Signed-off-by: Bjorn Helgaas Signed-off-by: Helge Deller diff --git a/arch/parisc/kernel/pci.c b/arch/parisc/kernel/pci.c index 64f2764..c99f3dd 100644 --- a/arch/parisc/kernel/pci.c +++ b/arch/parisc/kernel/pci.c @@ -171,24 +171,6 @@ void pcibios_set_master(struct pci_dev *dev) } -void __init pcibios_init_bus(struct pci_bus *bus) -{ - struct pci_dev *dev = bus->self; - unsigned short bridge_ctl; - - /* We deal only with pci controllers and pci-pci bridges. */ - if (!dev || (dev->class >> 8) != PCI_CLASS_BRIDGE_PCI) - return; - - /* PCI-PCI bridge - set the cache line and default latency - (32) for primary and secondary buses. */ - pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER, 32); - - pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bridge_ctl); - bridge_ctl |= PCI_BRIDGE_CTL_PARITY | PCI_BRIDGE_CTL_SERR; - pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bridge_ctl); -} - /* * pcibios align resources() is called every time generic PCI code * wants to generate a new address. The process of looking for -- cgit v0.10.2 From 5c477b45792fb8ccd279162483aa57cc8ee40ae1 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 6 Dec 2015 21:56:26 +0100 Subject: parisc: Wire up mlock2 syscall Signed-off-by: Helge Deller diff --git a/arch/parisc/include/uapi/asm/unistd.h b/arch/parisc/include/uapi/asm/unistd.h index 3317038..35bdccb 100644 --- a/arch/parisc/include/uapi/asm/unistd.h +++ b/arch/parisc/include/uapi/asm/unistd.h @@ -360,8 +360,9 @@ #define __NR_execveat (__NR_Linux + 342) #define __NR_membarrier (__NR_Linux + 343) #define __NR_userfaultfd (__NR_Linux + 344) +#define __NR_mlock2 (__NR_Linux + 345) -#define __NR_Linux_syscalls (__NR_userfaultfd + 1) +#define __NR_Linux_syscalls (__NR_mlock2 + 1) #define __IGNORE_select /* newselect */ diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index 78c3ef8..d4ffcfb 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -440,6 +440,7 @@ ENTRY_COMP(execveat) ENTRY_SAME(membarrier) ENTRY_SAME(userfaultfd) + ENTRY_SAME(mlock2) /* 345 */ .ifne (. - 90b) - (__NR_Linux_syscalls * (91b - 90b)) -- cgit v0.10.2 From 78c0cbffebe60e1182c8f077de2c06a1a48dc5ae Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 6 Dec 2015 21:25:20 +0100 Subject: parisc: Disable huge pages on Mako machines Mako-based machines (PA8800 and PA8900 CPUs) don't allow aliasing on non-equaivalent addresses. Signed-off-by: Helge Deller diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index d8534f9..291cee2 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -372,7 +372,8 @@ static inline pte_t pte_mkspecial(pte_t pte) { return pte; } */ #ifdef CONFIG_HUGETLB_PAGE #define pte_huge(pte) (pte_val(pte) & _PAGE_HUGE) -#define pte_mkhuge(pte) (__pte(pte_val(pte) | _PAGE_HUGE)) +#define pte_mkhuge(pte) (__pte(pte_val(pte) | \ + (parisc_requires_coherency() ? 0 : _PAGE_HUGE))) #else #define pte_huge(pte) (0) #define pte_mkhuge(pte) (pte) -- cgit v0.10.2 From a88c769548047b21f76fd71e04b6a3300ff17160 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 11 Dec 2015 13:40:24 -0800 Subject: mm: hugetlb: fix hugepage memory leak caused by wrong reserve count When dequeue_huge_page_vma() in alloc_huge_page() fails, we fall back on alloc_buddy_huge_page() to directly create a hugepage from the buddy allocator. In that case, however, if alloc_buddy_huge_page() succeeds we don't decrement h->resv_huge_pages, which means that successful hugetlb_fault() returns without releasing the reserve count. As a result, subsequent hugetlb_fault() might fail despite that there are still free hugepages. This patch simply adds decrementing code on that code path. I reproduced this problem when testing v4.3 kernel in the following situation: - the test machine/VM is a NUMA system, - hugepage overcommiting is enabled, - most of hugepages are allocated and there's only one free hugepage which is on node 0 (for example), - another program, which calls set_mempolicy(MPOL_BIND) to bind itself to node 1, tries to allocate a hugepage, - the allocation should fail but the reserve count is still hold. Signed-off-by: Naoya Horiguchi Cc: David Rientjes Cc: Dave Hansen Cc: Mel Gorman Cc: Joonsoo Kim Cc: Hillf Danton Cc: Mike Kravetz Cc: [3.16+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 827bb02..4fe4340 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1886,7 +1886,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; - + if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { + SetPagePrivate(page); + h->resv_huge_pages--; + } spin_lock(&hugetlb_lock); list_move(&page->lru, &h->hugepage_activelist); /* Fall through */ -- cgit v0.10.2 From 9516a18a9a253a94292e74f11f92083126c5a237 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 11 Dec 2015 13:40:24 -0800 Subject: memcg: fix memory.high target When the memory.high threshold is exceeded, try_charge() schedules a task_work to reclaim the excess. The reclaim target is set to the number of pages requested by try_charge(). This is wrong, because try_charge() usually charges more pages than requested (batch > nr_pages) in order to refill per cpu stocks. As a result, a process in a cgroup can easily exceed memory.high significantly when doing a lot of charges w/o returning to userspace (e.g. reading a file in big chunks). Fix this issue by assuring that when exceeding memory.high a process reclaims as many pages as were actually charged (i.e. batch). Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c92a65b..32259e3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2128,7 +2128,7 @@ done_restock: */ do { if (page_counter_read(&memcg->memory) > memcg->high) { - current->memcg_nr_pages_over_high += nr_pages; + current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; } -- cgit v0.10.2 From 475a2f905d5a41d5fc569ef21841be67d0a7f788 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 11 Dec 2015 13:40:29 -0800 Subject: mm: fix swapped Movable and Reclaimable in /proc/pagetypeinfo Commit 016c13daa5c9 ("mm, page_alloc: use masks and shifts when converting GFP flags to migrate types") has swapped MIGRATE_MOVABLE and MIGRATE_RECLAIMABLE in the enum definition. However, migratetype_names wasn't updated to reflect that. As a result, the file /proc/pagetypeinfo shows the counts for Movable as Reclaimable and vice versa. Additionally, commit 0aaa29a56e4f ("mm, page_alloc: reserve pageblocks for high-order atomic allocations on demand") introduced MIGRATE_HIGHATOMIC, but did not add a letter to distinguish it into show_migration_types(), so it doesn't appear in the listing of free areas during page alloc failures or oom kills. This patch fixes both problems. The atomic reserves will show with a letter 'H' in the free areas listings. Fixes: 016c13daa5c9 ("mm, page_alloc: use masks and shifts when converting GFP flags to migrate types") Fixes: 0aaa29a56e4f ("mm, page_alloc: reserve pageblocks for high-order atomic allocations on demand") Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 17a3c66..9d666df 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3647,8 +3647,9 @@ static void show_migration_types(unsigned char type) { static const char types[MIGRATE_TYPES] = { [MIGRATE_UNMOVABLE] = 'U', - [MIGRATE_RECLAIMABLE] = 'E', [MIGRATE_MOVABLE] = 'M', + [MIGRATE_RECLAIMABLE] = 'E', + [MIGRATE_HIGHATOMIC] = 'H', #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 879a2be..2ec3434 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -921,8 +921,8 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, #ifdef CONFIG_PROC_FS static char * const migratetype_names[MIGRATE_TYPES] = { "Unmovable", - "Reclaimable", "Movable", + "Reclaimable", "HighAtomic", #ifdef CONFIG_CMA "CMA", -- cgit v0.10.2 From 373ccbe5927034b55bdc80b0f8b54d6e13fe8d12 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 11 Dec 2015 13:40:32 -0800 Subject: mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress Tetsuo Handa has reported that the system might basically livelock in OOM condition without triggering the OOM killer. The issue is caused by internal dependency of the direct reclaim on vmstat counter updates (via zone_reclaimable) which are performed from the workqueue context. If all the current workers get assigned to an allocation request, though, they will be looping inside the allocator trying to reclaim memory but zone_reclaimable can see stalled numbers so it will consider a zone reclaimable even though it has been scanned way too much. WQ concurrency logic will not consider this situation as a congested workqueue because it relies that worker would have to sleep in such a situation. This also means that it doesn't try to spawn new workers or invoke the rescuer thread if the one is assigned to the queue. In order to fix this issue we need to do two things. First we have to let wq concurrency code know that we are in trouble so we have to do a short sleep. In order to prevent from issues handled by 0e093d99763e ("writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone") we limit the sleep only to worker threads which are the ones of the interest anyway. The second thing to do is to create a dedicated workqueue for vmstat and mark it WQ_MEM_RECLAIM to note it participates in the reclaim and to have a spare worker thread for it. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: Tejun Heo Cc: Cristopher Lameter Cc: Joonsoo Kim Cc: Arkadiusz Miskiewicz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8ed2ffd..7340353 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait); * jiffies for either a BDI to exit congestion of the given @sync queue * or a write to complete. * - * In the absence of zone congestion, cond_resched() is called to yield - * the processor if necessary but otherwise does not sleep. + * In the absence of zone congestion, a short sleep or a cond_resched is + * performed to yield the processor and to allow other subsystems to make + * a forward progress. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function @@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) */ if (atomic_read(&nr_wb_congested[sync]) == 0 || !test_bit(ZONE_CONGESTED, &zone->flags)) { - cond_resched(); + + /* + * Memory allocation/reclaim might be called from a WQ + * context and the current implementation of the WQ + * concurrency control doesn't recognize that a particular + * WQ is congested if the worker thread is looping without + * ever sleeping. Therefore we have to do a short sleep + * here rather than calling cond_resched(). + */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout(1); + else + cond_resched(); /* In case we scheduled, work out time remaining */ ret = timeout - (jiffies - start); diff --git a/mm/vmstat.c b/mm/vmstat.c index 2ec3434..0d5712b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1379,6 +1379,7 @@ static const struct file_operations proc_vmstat_file_operations = { #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SMP +static struct workqueue_struct *vmstat_wq; static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; static cpumask_var_t cpu_stat_off; @@ -1391,7 +1392,7 @@ static void vmstat_update(struct work_struct *w) * to occur in the future. Keep on running the * update worker thread. */ - schedule_delayed_work_on(smp_processor_id(), + queue_delayed_work_on(smp_processor_id(), vmstat_wq, this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); } else { @@ -1460,7 +1461,7 @@ static void vmstat_shepherd(struct work_struct *w) if (need_update(cpu) && cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) - schedule_delayed_work_on(cpu, + queue_delayed_work_on(cpu, vmstat_wq, &per_cpu(vmstat_work, cpu), 0); put_online_cpus(); @@ -1549,6 +1550,7 @@ static int __init setup_vmstat(void) start_shepherd_timer(); cpu_notifier_register_done(); + vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); -- cgit v0.10.2 From ed0f1e2102b6f245210be5f93a08d708b6ac070f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 11 Dec 2015 13:40:35 -0800 Subject: MAINTAINERS: make Vladimir co-maintainer of the memory controller Vladimir architected and authored much of the current state of the memcg's slab memory accounting and tracking. Make sure he gets CC'd on bug reports ;-) Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 38df53f..9bff63c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2975,6 +2975,7 @@ F: kernel/cpuset.c CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) M: Johannes Weiner M: Michal Hocko +M: Vladimir Davydov L: cgroups@vger.kernel.org L: linux-mm@kvack.org S: Maintained -- cgit v0.10.2 From 3066a9670ba6f57dd5046640a2494912a2110f1e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 11 Dec 2015 13:40:38 -0800 Subject: osd fs: __r4w_get_page rely on PageUptodate for uptodate Commit 42cb14b110a5 ("mm: migrate dirty page without clear_page_dirty_for_io etc") simplified the migration of a PageDirty pagecache page: one stat needs moving from zone to zone and that's about all. It's convenient and safest for it to shift the PageDirty bit from old page to new, just before updating the zone stats: before copying data and marking the new PageUptodate. This is all done while both pages are isolated and locked, just as before; and just as before, there's a moment when the new page is visible in the radix_tree, but not yet PageUptodate. What's new is that it may now be briefly visible as PageDirty before it is PageUptodate. When I scoured the tree to see if this could cause a problem anywhere, the only places I found were in two similar functions __r4w_get_page(): which look up a page with find_get_page() (not using page lock), then claim it's uptodate if it's PageDirty or PageWriteback or PageUptodate. I'm not sure whether that was right before, but now it might be wrong (on rare occasions): only claim the page is uptodate if PageUptodate. Or perhaps the page in question could never be migratable anyway? Signed-off-by: Hugh Dickins Tested-by: Boaz Harrosh Cc: Benny Halevy Cc: Trond Myklebust Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 73c64da..60f03b7 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -592,10 +592,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) } unlock_page(page); } - if (PageDirty(page) || PageWriteback(page)) - *uptodate = true; - else - *uptodate = PageUptodate(page); + *uptodate = PageUptodate(page); EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); return page; } else { diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 5c0c6b5..9aebffb 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -476,10 +476,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) } unlock_page(page); } - if (PageDirty(page) || PageWriteback(page)) - *uptodate = true; - else - *uptodate = PageUptodate(page); + *uptodate = PageUptodate(page); dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); return page; } -- cgit v0.10.2 From 25be6a659598791d511c09ed71dca24e4844d128 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 11 Dec 2015 13:40:40 -0800 Subject: mm: fix kerneldoc on mem_cgroup_replace_page Whoops, I missed removing the kerneldoc comment of the lrucare arg removed from mem_cgroup_replace_page; but it's a good comment, keep it. Signed-off-by: Hugh Dickins Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 32259e3..e234c21 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5512,11 +5512,11 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) * mem_cgroup_replace_page - migrate a charge to another page * @oldpage: currently charged page * @newpage: page to transfer the charge to - * @lrucare: either or both pages might be on the LRU already * * Migrate the charge from @oldpage to @newpage. * * Both pages must be locked, @newpage->mapping must be set up. + * Either or both pages might be on the LRU already. */ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { -- cgit v0.10.2 From 98e89cf02aed11166698dd53c6f14865613babb3 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Fri, 11 Dec 2015 13:40:43 -0800 Subject: mm: kmemleak: mark kmemleak_init prototype as __init The kmemleak_init() definition in mm/kmemleak.c is marked __init but its prototype in include/linux/kmemleak.h is marked __ref since commit a6186d89c913 ("kmemleak: Mark the early log buffer as __initdata"). This causes a section mismatch which is reported as a warning when building with clang -Wsection, because kmemleak_init() is declared in section .ref.text but defined in .init.text. Fix this by marking kmemleak_init() prototype __init. Signed-off-by: Nicolas Iooss Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index d0a1f99..4894c68 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -25,7 +25,7 @@ #ifdef CONFIG_DEBUG_KMEMLEAK -extern void kmemleak_init(void) __ref; +extern void kmemleak_init(void) __init; extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, -- cgit v0.10.2 From 86fffe4a61dd972d5a4e23260d530be6da02f614 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 11 Dec 2015 13:40:46 -0800 Subject: kernel: remove stop_machine() Kconfig dependency Currently the full stop_machine() routine is only enabled on SMP if module unloading is enabled, or if the CPUs are hotpluggable. This leads to configurations where stop_machine() is broken as it will then only run the callback on the local CPU with irqs disabled, and not stop the other CPUs or run the callback on them. For example, this breaks MTRR setup on x86 in certain configs since ea8596bb2d8d379 ("kprobes/x86: Remove unused text_poke_smp() and text_poke_smp_batch() functions") as the MTRR is only established on the boot CPU. This patch removes the Kconfig option for STOP_MACHINE and uses the SMP and HOTPLUG_CPU config options to compile the correct stop_machine() for the architecture, removing the false dependency on MODULE_UNLOAD in the process. Link: https://lkml.org/lkml/2014/10/8/124 References: https://bugs.freedesktop.org/show_bug.cgi?id=84794 Signed-off-by: Chris Wilson Acked-by: Ingo Molnar Cc: "Paul E. McKenney" Cc: Pranith Kumar Cc: Michal Hocko Cc: Vladimir Davydov Cc: Johannes Weiner Cc: H. Peter Anvin Cc: Tejun Heo Cc: Iulia Manda Cc: Andy Lutomirski Cc: Rusty Russell Cc: Peter Zijlstra Cc: Chuck Ebbert Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 0adedca..0e1b154 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -99,7 +99,7 @@ static inline int try_stop_cpus(const struct cpumask *cpumask, * grabbing every spinlock (and more). So the "read" side to such a * lock is anything which disables preemption. */ -#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) /** * stop_machine: freeze the machine on all CPUs and run this function @@ -118,7 +118,7 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); -#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ static inline int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) @@ -137,5 +137,5 @@ static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, return stop_machine(fn, data, cpus); } -#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ #endif /* _LINUX_STOP_MACHINE */ diff --git a/init/Kconfig b/init/Kconfig index c24b6f7..235c7a2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2030,13 +2030,6 @@ config INIT_ALL_POSSIBLE it was better to provide this option than to break all the archs and have several arch maintainers pursuing me down dark alleys. -config STOP_MACHINE - bool - default y - depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU - help - Need stop_machine() primitive. - source "block/Kconfig" config PREEMPT_NOTIFIERS diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 867bc20..a3bbaee 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -531,7 +531,7 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#ifdef CONFIG_STOP_MACHINE +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { @@ -631,4 +631,4 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, return ret ?: done.ret; } -#endif /* CONFIG_STOP_MACHINE */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ -- cgit v0.10.2 From 0d777df5d8953293be090d9ab5a355db893e8357 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 11 Dec 2015 13:40:49 -0800 Subject: mm: hugetlb: call huge_pte_alloc() only if ptep is null Currently at the beginning of hugetlb_fault(), we call huge_pte_offset() and check whether the obtained *ptep is a migration/hwpoison entry or not. And if not, then we get to call huge_pte_alloc(). This is racy because the *ptep could turn into migration/hwpoison entry after the huge_pte_offset() check. This race results in BUG_ON in huge_pte_alloc(). We don't have to call huge_pte_alloc() when the huge_pte_offset() returns non-NULL, so let's fix this bug with moving the code into else block. Note that the *ptep could turn into a migration/hwpoison entry after this block, but that's not a problem because we have another !pte_present check later (we never go into hugetlb_no_page() in that case.) Fixes: 290408d4a250 ("hugetlb: hugepage migration core") Signed-off-by: Naoya Horiguchi Acked-by: Hillf Danton Acked-by: David Rientjes Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Joonsoo Kim Cc: Mike Kravetz Cc: [2.6.36+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4fe4340..9e3fa71 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3696,12 +3696,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } else { + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; - mapping = vma->vm_file->f_mapping; idx = vma_hugecache_offset(h, vma, address); -- cgit v0.10.2 From dbe409e4f5e5075bd9ff7f8dd5c627abf3ee38c1 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 11 Dec 2015 13:40:52 -0800 Subject: mm/hugetlb.c: fix resv map memory leak for placeholder entries Dmitry Vyukov reported the following memory leak unreferenced object 0xffff88002eaafd88 (size 32): comm "a.out", pid 5063, jiffies 4295774645 (age 15.810s) hex dump (first 32 bytes): 28 e9 4e 63 00 88 ff ff 28 e9 4e 63 00 88 ff ff (.Nc....(.Nc.... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: kmalloc include/linux/slab.h:458 region_chg+0x2d4/0x6b0 mm/hugetlb.c:398 __vma_reservation_common+0x2c3/0x390 mm/hugetlb.c:1791 vma_needs_reservation mm/hugetlb.c:1813 alloc_huge_page+0x19e/0xc70 mm/hugetlb.c:1845 hugetlb_no_page mm/hugetlb.c:3543 hugetlb_fault+0x7a1/0x1250 mm/hugetlb.c:3717 follow_hugetlb_page+0x339/0xc70 mm/hugetlb.c:3880 __get_user_pages+0x542/0xf30 mm/gup.c:497 populate_vma_page_range+0xde/0x110 mm/gup.c:919 __mm_populate+0x1c7/0x310 mm/gup.c:969 do_mlock+0x291/0x360 mm/mlock.c:637 SYSC_mlock2 mm/mlock.c:658 SyS_mlock2+0x4b/0x70 mm/mlock.c:648 Dmitry identified a potential memory leak in the routine region_chg, where a region descriptor is not free'ed on an error path. However, the root cause for the above memory leak resides in region_del. In this specific case, a "placeholder" entry is created in region_chg. The associated page allocation fails, and the placeholder entry is left in the reserve map. This is "by design" as the entry should be deleted when the map is released. The bug is in the region_del routine which is used to delete entries within a specific range (and when the map is released). region_del did not handle the case where a placeholder entry exactly matched the start of the range range to be deleted. In this case, the entry would not be deleted and leaked. The fix is to take these special placeholder entries into account in region_del. The region_chg error path leak is also fixed. Fixes: feba16e25a57 ("mm/hugetlb: add region_del() to delete a specific range of entries") Signed-off-by: Mike Kravetz Reported-by: Dmitry Vyukov Acked-by: Hillf Danton Cc: [4.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9e3fa71..ef6963b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -372,8 +372,10 @@ retry_locked: spin_unlock(&resv->lock); trg = kmalloc(sizeof(*trg), GFP_KERNEL); - if (!trg) + if (!trg) { + kfree(nrg); return -ENOMEM; + } spin_lock(&resv->lock); list_add(&trg->link, &resv->region_cache); @@ -483,8 +485,16 @@ static long region_del(struct resv_map *resv, long f, long t) retry: spin_lock(&resv->lock); list_for_each_entry_safe(rg, trg, head, link) { - if (rg->to <= f) + /* + * Skip regions before the range to be deleted. file_region + * ranges are normally of the form [from, to). However, there + * may be a "placeholder" entry in the map which is of the form + * (from, to) with from == to. Check for placeholder entries + * at the beginning of the range to be deleted. + */ + if (rg->to <= f && (rg->to != rg->from || rg->to != f)) continue; + if (rg->from >= t) break; -- cgit v0.10.2 From 267a4c76bbdb950688d3aeb020976c2918064584 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 11 Dec 2015 13:40:55 -0800 Subject: tmpfs: fix shmem_evict_inode() warnings on i_blocks Dmitry Vyukov provides a little program, autogenerated by syzkaller, which races a fault on a mapping of a sparse memfd object, against truncation of that object below the fault address: run repeatedly for a few minutes, it reliably generates shmem_evict_inode()'s WARN_ON(inode->i_blocks). (But there's nothing specific to memfd here, nor to the fstat which it happened to use to generate the fault: though that looked suspicious, since a shmem_recalc_inode() had been added there recently. The same problem can be reproduced with open+unlink in place of memfd_create, and with fstatfs in place of fstat.) v3.7 commit 0f3c42f522dc ("tmpfs: change final i_blocks BUG to WARNING") explains one cause of such a warning (a race with shmem_writepage to swap), and possible solutions; but we never took it further, and this syzkaller incident turns out to have a different cause. shmem_getpage_gfp()'s error recovery, when a freshly allocated page is then found to be beyond eof, looks plausible - decrementing the alloced count that was just before incremented - but in fact can go wrong, if a racing thread (the truncator, for example) gets its shmem_recalc_inode() in just after our delete_from_page_cache(). delete_from_page_cache() decrements nrpages, that shmem_recalc_inode() will balance the books by decrementing alloced itself, then our decrement of alloced take it one too low: leading to the WARNING when the object is finally evicted. Once the new page has been exposed in the page cache, shmem_getpage_gfp() must leave it to shmem_recalc_inode() itself to get the accounting right in all cases (and not fall through from "trunc:" to "decused:"). Adjust that error recovery block; and the reinitialization of info and sbinfo can be removed too. While we're here, fix shmem_writepage() to avoid the original issue: it will be safe against a racing shmem_recalc_inode(), if it merely increments swapped before the shmem_delete_from_page_cache() which decrements nrpages (but it must then do its own shmem_recalc_inode() before that, while still in balance, instead of after). (Aside: why do we shmem_recalc_inode() here in the swap path? Because its raison d'etre is to cope with clean sparse shmem pages being reclaimed behind our back: so here when swapping is a good place to look for that case.) But I've not now managed to reproduce this bug, even without the patch. I don't see why I didn't do that earlier: perhaps inhibited by the preference to eliminate shmem_recalc_inode() altogether. Driven by this incident, I do now have a patch to do so at last; but still want to sit on it for a bit, there's a couple of questions yet to be resolved. Signed-off-by: Hugh Dickins Reported-by: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/shmem.c b/mm/shmem.c index 9187eee..2afcdbb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -843,14 +843,14 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) list_add_tail(&info->swaplist, &shmem_swaplist); if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { - swap_shmem_alloc(swap); - shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); - spin_lock(&info->lock); - info->swapped++; shmem_recalc_inode(inode); + info->swapped++; spin_unlock(&info->lock); + swap_shmem_alloc(swap); + shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + mutex_unlock(&shmem_swaplist_mutex); BUG_ON(page_mapped(page)); swap_writepage(page, wbc); @@ -1078,7 +1078,7 @@ repeat: if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; - goto failed; + goto unlock; } if (page && sgp == SGP_WRITE) @@ -1246,11 +1246,15 @@ clear: /* Perhaps the file has been truncated since we checked */ if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + if (alloced) { + ClearPageDirty(page); + delete_from_page_cache(page); + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + } error = -EINVAL; - if (alloced) - goto trunc; - else - goto failed; + goto unlock; } *pagep = page; return 0; @@ -1258,23 +1262,13 @@ clear: /* * Error recovery. */ -trunc: - info = SHMEM_I(inode); - ClearPageDirty(page); - delete_from_page_cache(page); - spin_lock(&info->lock); - info->alloced--; - inode->i_blocks -= BLOCKS_PER_PAGE; - spin_unlock(&info->lock); decused: - sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) percpu_counter_add(&sbinfo->used_blocks, -1); unacct: shmem_unacct_blocks(info->flags, 1); failed: - if (swap.val && error != -EINVAL && - !shmem_confirm_swap(mapping, index, swap)) + if (swap.val && !shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; unlock: if (page) { -- cgit v0.10.2 From 26bbe7ef6d5cdc7ec08cba6d433fca4060f258f3 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Fri, 11 Dec 2015 13:40:57 -0800 Subject: drivers/base/memory.c: prohibit offlining of memory blocks with missing sections Commit bdee237c0343 ("x86: mm: Use 2GB memory block size on large-memory x86-64 systems") and 982792c782ef ("x86, mm: probe memory block size for generic x86 64bit") introduced large block sizes for x86. This made it possible to have multiple sections per memory block where previously, there was a only every one section per block. Since blocks consist of contiguous ranges of section, there can be holes in the blocks where sections are not present. If one attempts to offline such a block, a crash occurs since the code is not designed to deal with this. This patch is a quick fix to gaurd against the crash by not allowing blocks with non-present sections to be offlined. Addresses https://bugzilla.kernel.org/show_bug.cgi?id=107781 Signed-off-by: Seth Jennings Reported-by: Andrew Banman Cc: Daniel J Blueman Cc: Yinghai Lu Cc: Greg KH Cc: Russ Anderson Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 2804aed..25425d3 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -303,6 +303,10 @@ static int memory_subsys_offline(struct device *dev) if (mem->state == MEM_OFFLINE) return 0; + /* Can't offline block with non-present sections */ + if (mem->section_count != sections_per_block) + return -EINVAL; + return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); } -- cgit v0.10.2 From a2b829d95958da2025ef844c0f53ac15ad720fac Mon Sep 17 00:00:00 2001 From: Chen Jie Date: Fri, 11 Dec 2015 13:41:00 -0800 Subject: mm/oom_kill.c: avoid attempting to kill init sharing same memory It's possible that an oom killed victim shares an ->mm with the init process and thus oom_kill_process() would end up trying to kill init as well. This has been shown in practice: Out of memory: Kill process 9134 (init) score 3 or sacrifice child Killed process 9134 (init) total-vm:1868kB, anon-rss:84kB, file-rss:572kB Kill process 1 (init) sharing same memory ... Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 And this will result in a kernel panic. If a process is forked by init and selected for oom kill while still sharing init_mm, then it's likely this system is in a recoverable state. However, it's better not to try to kill init and allow the machine to panic due to unkillable processes. [rientjes@google.com: rewrote changelog] [akpm@linux-foundation.org: fix inverted test, per Ben] Signed-off-by: Chen Jie Signed-off-by: David Rientjes Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Ben Hutchings Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d13a339..c126809 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -608,6 +608,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, continue; if (unlikely(p->flags & PF_KTHREAD)) continue; + if (is_global_init(p)) + continue; if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) continue; -- cgit v0.10.2 From 854ee2e944b4daf795e32562a7d2f9e90ab5a6a8 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 11 Dec 2015 13:41:03 -0800 Subject: ocfs2: fix SGID not inherited issue Commit 8f1eb48758aa ("ocfs2: fix umask ignored issue") introduced an issue, SGID of sub dir was not inherited from its parents dir. It is because SGID is set into "inode->i_mode" in ocfs2_get_init_inode(), but is overwritten by "mode" which don't have SGID set later. Fixes: 8f1eb48758aa ("ocfs2: fix umask ignored issue") Signed-off-by: Junxiao Bi Cc: Mark Fasheh Cc: Joel Becker Acked-by: Srinivas Eeda Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a03f6f4..3123408 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -367,13 +367,11 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = posix_acl_create(dir, &mode, &default_acl, &acl); + status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (status) { mlog_errno(status); goto leave; } - /* update inode->i_mode after mask with "umask". */ - inode->i_mode = mode; handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, S_ISDIR(mode), -- cgit v0.10.2 From 2d33fa1059da4c8e816627a688d950b613ec0474 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Fri, 11 Dec 2015 13:41:06 -0800 Subject: sh64: fix __NR_fgetxattr According to arch/sh/kernel/syscalls_64.S and common sense, __NR_fgetxattr has to be defined to 259, but it doesn't. Instead, it's defined to 269, which is of course used by another syscall, __NR_sched_setaffinity in this case. This bug was found by strace test suite. Signed-off-by: Dmitry V. Levin Acked-by: Geert Uytterhoeven Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/sh/include/uapi/asm/unistd_64.h b/arch/sh/include/uapi/asm/unistd_64.h index e6820c8..47ebd5b 100644 --- a/arch/sh/include/uapi/asm/unistd_64.h +++ b/arch/sh/include/uapi/asm/unistd_64.h @@ -278,7 +278,7 @@ #define __NR_fsetxattr 256 #define __NR_getxattr 257 #define __NR_lgetxattr 258 -#define __NR_fgetxattr 269 +#define __NR_fgetxattr 259 #define __NR_listxattr 260 #define __NR_llistxattr 261 #define __NR_flistxattr 262 -- cgit v0.10.2 From 9530d0fe129c0197d5df13319ccefd08a827383b Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Fri, 11 Dec 2015 13:41:09 -0800 Subject: MIPS: fix DMA contiguous allocation Recent changes to how GFP_ATOMIC is defined seems to have broken the condition to use mips_alloc_from_contiguous() in mips_dma_alloc_coherent(). I couldn't bottom out the exact change but I think it's this commit d0164adc89f6 ("mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd"). GFP_ATOMIC has multiple bits set and the check for !(gfp & GFP_ATOMIC) isn't enough. The reason behind this condition is to check whether we can potentially do a sleeping memory allocation. Use gfpflags_allow_blocking() instead which should be more robust. Signed-off-by: Qais Yousef Acked-by: Mel Gorman Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c index d8117be..730d394 100644 --- a/arch/mips/mm/dma-default.c +++ b/arch/mips/mm/dma-default.c @@ -145,7 +145,7 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size, gfp = massage_gfp_flags(dev, gfp); - if (IS_ENABLED(CONFIG_DMA_CMA) && !(gfp & GFP_ATOMIC)) + if (IS_ENABLED(CONFIG_DMA_CMA) && gfpflags_allow_blocking(gfp)) page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (!page) -- cgit v0.10.2 From dfd01f026058a59a513f8a365b439a0681b803af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 13 Dec 2015 22:11:16 +0100 Subject: sched/wait: Fix the signal handling fix Jan Stancek reported that I wrecked things for him by fixing things for Vladimir :/ His report was due to an UNINTERRUPTIBLE wait getting -EINTR, which should not be possible, however my previous patch made this possible by unconditionally checking signal_pending(). We cannot use current->state as was done previously, because the instruction after the store to that variable it can be changed. We must instead pass the initial state along and use that. Fixes: 68985633bccb ("sched/wait: Fix signal handling in bit wait helpers") Reported-by: Jan Stancek Reported-by: Chris Mason Tested-by: Jan Stancek Tested-by: Vladimir Murzin Tested-by: Chris Mason Reviewed-by: Paul Turner Cc: Ingo Molnar Cc: tglx@linutronix.de Cc: Oleg Nesterov Cc: hpa@zytor.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 6b66dd5..a329f5b 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1831,11 +1831,11 @@ cifs_invalidate_mapping(struct inode *inode) * @word: long word containing the bit lock */ static int -cifs_wait_bit_killable(struct wait_bit_key *key) +cifs_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 31b0a52..c7e8b87 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -75,11 +75,11 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks * @word: long word containing the bit lock */ -int nfs_wait_bit_killable(struct wait_bit_key *key) +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 56cfde2..9dea85f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -379,7 +379,7 @@ extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); -extern int nfs_wait_bit_killable(struct wait_bit_key *key); +extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); /* super.c */ extern const struct super_operations nfs_sops; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index fe3ddd2..452a011 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -129,7 +129,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c) set_bit(NFS_IO_INPROGRESS, &c->flags); if (atomic_read(&c->io_count) == 0) break; - ret = nfs_wait_bit_killable(&q.key); + ret = nfs_wait_bit_killable(&q.key, TASK_KILLABLE); } while (atomic_read(&c->io_count) != 0 && !ret); finish_wait(wq, &q.wait); return ret; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5a8ae21..bec0384 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1466,11 +1466,11 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, } /* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */ -static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key) +static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key, int mode) { if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags)) return 1; - return nfs_wait_bit_killable(key); + return nfs_wait_bit_killable(key, mode); } static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) diff --git a/include/linux/wait.h b/include/linux/wait.h index 1e1bf9f..513b36f 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -145,7 +145,7 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) list_del(&old->task_list); } -typedef int wait_bit_action_f(struct wait_bit_key *); +typedef int wait_bit_action_f(struct wait_bit_key *, int mode); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); @@ -960,10 +960,10 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); } while (0) -extern int bit_wait(struct wait_bit_key *); -extern int bit_wait_io(struct wait_bit_key *); -extern int bit_wait_timeout(struct wait_bit_key *); -extern int bit_wait_io_timeout(struct wait_bit_key *); +extern int bit_wait(struct wait_bit_key *, int); +extern int bit_wait_io(struct wait_bit_key *, int); +extern int bit_wait_timeout(struct wait_bit_key *, int); +extern int bit_wait_io_timeout(struct wait_bit_key *, int); /** * wait_on_bit - wait for a bit to be cleared diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f10bd87..f15d6b6 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -392,7 +392,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key); + ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -431,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(&q->key); + ret = action(&q->key, mode); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -581,43 +581,43 @@ void wake_up_atomic_t(atomic_t *p) } EXPORT_SYMBOL(wake_up_atomic_t); -__sched int bit_wait(struct wait_bit_key *word) +__sched int bit_wait(struct wait_bit_key *word, int mode) { schedule(); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait); -__sched int bit_wait_io(struct wait_bit_key *word) +__sched int bit_wait_io(struct wait_bit_key *word, int mode) { io_schedule(); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait_io); -__sched int bit_wait_timeout(struct wait_bit_key *word) +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word) +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f14f24e..73ad57a 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -250,11 +250,11 @@ void rpc_destroy_wait_queue(struct rpc_wait_queue *queue) } EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); -static int rpc_wait_bit_killable(struct wait_bit_key *key) +static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } -- cgit v0.10.2 From 9f9499ae8e6415cefc4fe0a96ad0e27864353c89 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 13 Dec 2015 17:42:58 -0800 Subject: Linux 4.4-rc5 diff --git a/Makefile b/Makefile index d644f6e..bc0165d 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 4 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Blurry Fish Butt # *DOCUMENTATION* -- cgit v0.10.2 From 21266be9ed542f13436bd9c75316d43e1e84f6ae Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 19 Nov 2015 18:19:29 -0800 Subject: arch: consolidate CONFIG_STRICT_DEVM in lib/Kconfig.debug Let all the archs that implement devmem_is_allowed() opt-in to a common definition of CONFIG_STRICT_DEVM in lib/Kconfig.debug. Cc: Kees Cook Cc: Russell King Cc: Will Deacon Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Andrew Morton Cc: Greg Kroah-Hartman Cc: "David S. Miller" Acked-by: Catalin Marinas Acked-by: Heiko Carstens [heiko: drop 'default y' for s390] Acked-by: Ingo Molnar Suggested-by: Arnd Bergmann Signed-off-by: Dan Williams diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 34e1569..b8a4797 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2,6 +2,7 @@ config ARM bool default y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 259c0ca..e356357 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -15,20 +15,6 @@ config ARM_PTDUMP kernel. If in doubt, say "N" -config STRICT_DEVMEM - bool "Filter access to /dev/mem" - depends on MMU - ---help--- - If this option is disabled, you allow userspace (root) access to all - of memory, including kernel and userspace memory. Accidental - access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. - - If this option is switched on, the /dev/mem file only allows - userspace access to memory mapped peripherals. - - If in doubt, say Y. - # RMK wants arm kernels compiled with frame pointers or stack unwinding. # If you know what you are doing and are willing to live without stack # traces, you can get a slightly smaller kernel by setting this option to diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 871f217..08f64b4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -3,6 +3,7 @@ config ARM64 select ACPI_CCA_REQUIRED if ACPI select ACPI_GENERIC_GSI if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index 04fb73b..e13c4bf 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -14,20 +14,6 @@ config ARM64_PTDUMP kernel. If in doubt, say "N" -config STRICT_DEVMEM - bool "Filter access to /dev/mem" - depends on MMU - help - If this option is disabled, you allow userspace (root) access to all - of memory, including kernel and userspace memory. Accidental - access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. - - If this option is switched on, the /dev/mem file only allows - userspace access to memory mapped peripherals. - - If in doubt, say Y. - config PID_IN_CONTEXTIDR bool "Write the current PID to the CONTEXTIDR register" help diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 34aa193..03bfd6b 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -10,6 +10,7 @@ config FRV select HAVE_DEBUG_BUGVERBOSE select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CPU_DEVICES + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_WANT_IPC_PARSE_VERSION select OLD_SIGSUSPEND3 select OLD_SIGACTION diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 9e44bbd..836ac5a 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -13,6 +13,7 @@ config M32R select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_ATOMIC64 + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_USES_GETTIMEOFFSET select MODULES_USE_ELF_RELA select HAVE_DEBUG_STACKOVERFLOW diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index db49e0d..85eabc4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -159,6 +159,7 @@ config PPC select EDAC_SUPPORT select EDAC_ATOMIC_SCRUB select ARCH_HAS_DMA_SET_COHERENT_MASK + select ARCH_HAS_DEVMEM_IS_ALLOWED select HAVE_ARCH_SECCOMP_FILTER config GENERIC_CSUM diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 3a510f4..a0e44a9 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -335,18 +335,6 @@ config PPC_EARLY_DEBUG_CPM_ADDR platform probing is done, all platforms selected must share the same address. -config STRICT_DEVMEM - def_bool y - prompt "Filter access to /dev/mem" - help - This option restricts access to /dev/mem. If this option is - disabled, you allow userspace access to all memory, including - kernel and userspace memory. Accidental memory access is likely - to be disastrous. - Memory access is required for experts who want to debug the kernel. - - If you are unsure, say Y. - config FAIL_IOMMU bool "Fault-injection capability for IOMMU" depends on FAULT_INJECTION diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 3a55f49..779becb 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -66,6 +66,7 @@ config S390 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SG_CHAIN diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index c56878e..26c5d5be 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -5,18 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" -config STRICT_DEVMEM - def_bool y - prompt "Filter access to /dev/mem" - ---help--- - This option restricts access to /dev/mem. If this option is - disabled, you allow userspace access to all memory, including - kernel and userspace memory. Accidental memory access is likely - to be disastrous. - Memory access is required for experts who want to debug the kernel. - - If you are unsure, say Y. - config S390_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 106c21b..cf31168 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -19,6 +19,7 @@ config TILE select VIRT_TO_BUS select SYS_HYPERVISOR select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA @@ -116,9 +117,6 @@ config ARCH_DISCONTIGMEM_DEFAULT config TRACE_IRQFLAGS_SUPPORT def_bool y -config STRICT_DEVMEM - def_bool y - # SMP is required for Tilera Linux. config SMP def_bool y diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index c9faddc..5dc4c0a 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -1,5 +1,6 @@ config UNICORE32 def_bool y + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_MEMBLOCK diff --git a/arch/unicore32/Kconfig.debug b/arch/unicore32/Kconfig.debug index 1a36262..f075bbe 100644 --- a/arch/unicore32/Kconfig.debug +++ b/arch/unicore32/Kconfig.debug @@ -2,20 +2,6 @@ menu "Kernel hacking" source "lib/Kconfig.debug" -config STRICT_DEVMEM - bool "Filter access to /dev/mem" - depends on MMU - ---help--- - If this option is disabled, you allow userspace (root) access to all - of memory, including kernel and userspace memory. Accidental - access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. - - If this option is switched on, the /dev/mem file only allows - userspace access to memory mapped peripherals. - - If in doubt, say Y. - config EARLY_PRINTK def_bool DEBUG_OCD help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index db3622f..75fba1f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,6 +24,7 @@ config X86 select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 137dfa9..1116452 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" -config STRICT_DEVMEM - bool "Filter access to /dev/mem" - ---help--- - If this option is disabled, you allow userspace (root) access to all - of memory, including kernel and userspace memory. Accidental - access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. Note that with PAT support - enabled, even in this case there are restrictions on /dev/mem - use due to the cache aliasing requirements. - - If this option is switched on, the /dev/mem file only allows - userspace access to PCI space and the BIOS code and data regions. - This is sufficient for dosemu and X and all common users of - /dev/mem. - - If in doubt, say Y. - config X86_VERBOSE_BOOTUP bool "Enable verbose x86 bootup info messages" default y diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8c15b29..289dfcb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1853,3 +1853,25 @@ source "samples/Kconfig" source "lib/Kconfig.kgdb" +config ARCH_HAS_DEVMEM_IS_ALLOWED + bool + +config STRICT_DEVMEM + bool "Filter access to /dev/mem" + depends on MMU + depends on ARCH_HAS_DEVMEM_IS_ALLOWED + default y if TILE || PPC + ---help--- + If this option is disabled, you allow userspace (root) access to all + of memory, including kernel and userspace memory. Accidental + access to this is obviously disastrous, but specific access can + be used by people debugging the kernel. Note that with PAT support + enabled, even in this case there are restrictions on /dev/mem + use due to the cache aliasing requirements. + + If this option is switched on, the /dev/mem file only allows + userspace access to PCI space and the BIOS code and data regions. + This is sufficient for dosemu and X and all common users of + /dev/mem. + + If in doubt, say Y. -- cgit v0.10.2 From 90a545e981267e917b9d698ce07affd69787db87 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 23 Nov 2015 15:49:03 -0800 Subject: restrict /dev/mem to idle io memory ranges This effectively promotes IORESOURCE_BUSY to IORESOURCE_EXCLUSIVE semantics by default. If userspace really believes it is safe to access the memory region it can also perform the extra step of disabling an active driver. This protects device address ranges with read side effects and otherwise directs userspace to use the driver. Persistent memory presents a large "mistake surface" to /dev/mem as now accidental writes can corrupt a filesystem. In general if a device driver is busily using a memory region it already informs other parts of the kernel to not touch it via request_mem_region(). /dev/mem should honor the same safety restriction by default. Debugging a device driver from userspace becomes more difficult with this enabled. Any application using /dev/mem or mmap of sysfs pci resources will now need to perform the extra step of either: 1/ Disabling the driver, for example: echo > /dev/bus//drivers//unbind 2/ Rebooting with "iomem=relaxed" on the command line 3/ Recompiling with CONFIG_IO_STRICT_DEVMEM=n Traditional users of /dev/mem like dosemu are unaffected because the first 1MB of memory is not subject to the IO_STRICT_DEVMEM restriction. Legacy X configurations use /dev/mem to talk to graphics hardware, but that functionality has since moved to kernel graphics drivers. Cc: Arnd Bergmann Cc: Russell King Cc: Andrew Morton Cc: Greg Kroah-Hartman Acked-by: Kees Cook Acked-by: Ingo Molnar Signed-off-by: Dan Williams diff --git a/kernel/resource.c b/kernel/resource.c index f150dbb..09c0597 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr) break; if (p->end < addr) continue; - if (p->flags & IORESOURCE_BUSY && - p->flags & IORESOURCE_EXCLUSIVE) { + /* + * A resource is exclusive if IORESOURCE_EXCLUSIVE is set + * or CONFIG_IO_STRICT_DEVMEM is enabled and the + * resource is busy. + */ + if ((p->flags & IORESOURCE_BUSY) == 0) + continue; + if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) + || p->flags & IORESOURCE_EXCLUSIVE) { err = 1; break; } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 289dfcb..073496d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1869,9 +1869,26 @@ config STRICT_DEVMEM enabled, even in this case there are restrictions on /dev/mem use due to the cache aliasing requirements. + If this option is switched on, and IO_STRICT_DEVMEM=n, the /dev/mem + file only allows userspace access to PCI space and the BIOS code and + data regions. This is sufficient for dosemu and X and all common + users of /dev/mem. + + If in doubt, say Y. + +config IO_STRICT_DEVMEM + bool "Filter I/O access to /dev/mem" + depends on STRICT_DEVMEM + default STRICT_DEVMEM + ---help--- + If this option is disabled, you allow userspace (root) access to all + io-memory regardless of whether a driver is actively using that + range. Accidental access to this is obviously disastrous, but + specific access can be used by people debugging kernel drivers. + If this option is switched on, the /dev/mem file only allows - userspace access to PCI space and the BIOS code and data regions. - This is sufficient for dosemu and X and all common users of - /dev/mem. + userspace access to *idle* io-memory ranges (see /proc/iomem) This + may break traditional users of /dev/mem (dosemu, legacy X, etc...) + if the driver using a given range cannot be disabled. If in doubt, say Y. -- cgit v0.10.2 From 4ebb16ca9a06a54cdb2e7f2ce1e506fa4d432445 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 28 Oct 2015 07:48:19 +0900 Subject: block: introduce bdev_file_inode() Similar to the file_inode() helper, provide a helper to lookup the inode for a raw block device itself. Cc: Al Viro Suggested-by: Jan Kara Reviewed-by: Jan Kara Reviewed-by: Jeff Moyer Signed-off-by: Dan Williams diff --git a/fs/block_dev.c b/fs/block_dev.c index 44d4a1e..52248bc 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -156,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } +static struct inode *bdev_file_inode(struct file *file) +{ + return file->f_mapping->host; +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct inode *inode = bdev_file_inode(file); if (IS_DAX(inode)) return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, @@ -338,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, */ static loff_t block_llseek(struct file *file, loff_t offset, int whence) { - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t retval; mutex_lock(&bd_inode->i_mutex); @@ -349,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { - struct inode *bd_inode = filp->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); int error; @@ -1605,14 +1610,14 @@ EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { - struct block_device *bdev = I_BDEV(filp->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); blkdev_put(bdev, filp->f_mode); return 0; } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { - struct block_device *bdev = I_BDEV(file->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); fmode_t mode = file->f_mode; /* @@ -1637,7 +1642,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; ssize_t ret; @@ -1669,7 +1674,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter); ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; -- cgit v0.10.2 From 5a023cdba50c5f5f2bc351783b3131699deb3937 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 30 Nov 2015 10:20:29 -0800 Subject: block: enable dax for raw block devices If an application wants exclusive access to all of the persistent memory provided by an NVDIMM namespace it can use this raw-block-dax facility to forgo establishing a filesystem. This capability is targeted primarily to hypervisors wanting to provision persistent memory for guests. It can be disabled / enabled dynamically via the new BLKDAXSET ioctl. Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Andrew Morton Cc: Ross Zwisler Reported-by: kbuild test robot Reviewed-by: Jan Kara Signed-off-by: Dan Williams diff --git a/block/ioctl.c b/block/ioctl.c index 0918aed..7a964d8 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -406,6 +406,62 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } +#ifdef CONFIG_FS_DAX +bool blkdev_dax_capable(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + + if (!disk->fops->direct_access) + return false; + + /* + * If the partition is not aligned on a page boundary, we can't + * do dax I/O to it. + */ + if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) + || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + return false; + + return true; +} + +static int blkdev_daxset(struct block_device *bdev, unsigned long argp) +{ + unsigned long arg; + int rc = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (get_user(arg, (int __user *)(argp))) + return -EFAULT; + arg = !!arg; + if (arg == !!(bdev->bd_inode->i_flags & S_DAX)) + return 0; + + if (arg) + arg = S_DAX; + + if (arg && !blkdev_dax_capable(bdev)) + return -ENOTTY; + + mutex_lock(&bdev->bd_inode->i_mutex); + if (bdev->bd_map_count == 0) + inode_set_flags(bdev->bd_inode, arg, S_DAX); + else + rc = -EBUSY; + mutex_unlock(&bdev->bd_inode->i_mutex); + return rc; +} +#else +static int blkdev_daxset(struct block_device *bdev, int arg) +{ + if (arg) + return -ENOTTY; + return 0; +} +#endif + static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -568,6 +624,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); + case BLKDAXSET: + return blkdev_daxset(bdev, arg); + case BLKDAXGET: + return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); + break; case IOC_PR_REGISTER: return blkdev_pr_register(bdev, argp); case IOC_PR_RESERVE: diff --git a/fs/block_dev.c b/fs/block_dev.c index 52248bc..5c0b2cb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1235,8 +1235,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + if (!blkdev_dax_capable(bdev)) + bdev->bd_inode->i_flags &= ~S_DAX; + } /* * If the device is invalidated, rescan partition @@ -1250,6 +1253,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) else if (ret == -ENOMEDIUM) invalidate_partitions(disk, bdev); } + if (ret) goto out_clear; } else { @@ -1270,12 +1274,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - /* - * If the partition is not aligned on a page - * boundary, we can't do dax I/O to it. - */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) || - (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + if (!blkdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } } else { @@ -1713,13 +1712,101 @@ static const struct address_space_operations def_blk_aops = { .is_dirty_writeback = buffer_check_dirty_writeback, }; +#ifdef CONFIG_FS_DAX +/* + * In the raw block case we do not need to contend with truncation nor + * unwritten file extents. Without those concerns there is no need for + * additional locking beyond the mmap_sem context that these routines + * are already executing under. + * + * Note, there is no protection if the block device is dynamically + * resized (partition grow/shrink) during a fault. A stable block device + * size is already not enforced in the blkdev_direct_IO path. + * + * For DAX, it is the responsibility of the block device driver to + * ensure the whole-disk device size is stable while requests are in + * flight. + * + * Finally, unlike the filemap_page_mkwrite() case there is no + * filesystem superblock to sync against freezing. We still include a + * pfn_mkwrite callback for dax drivers to receive write fault + * notifications. + */ +static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return __dax_fault(vma, vmf, blkdev_get_block, NULL); +} + +static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); +} + +static void blkdev_vm_open(struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(vma->vm_file); + struct block_device *bdev = I_BDEV(bd_inode); + + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count++; + mutex_unlock(&bd_inode->i_mutex); +} + +static void blkdev_vm_close(struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(vma->vm_file); + struct block_device *bdev = I_BDEV(bd_inode); + + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count--; + mutex_unlock(&bd_inode->i_mutex); +} + +static const struct vm_operations_struct blkdev_dax_vm_ops = { + .open = blkdev_vm_open, + .close = blkdev_vm_close, + .fault = blkdev_dax_fault, + .pmd_fault = blkdev_dax_pmd_fault, + .pfn_mkwrite = blkdev_dax_fault, +}; + +static const struct vm_operations_struct blkdev_default_vm_ops = { + .open = blkdev_vm_open, + .close = blkdev_vm_close, + .fault = filemap_fault, + .map_pages = filemap_map_pages, +}; + +static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(bd_inode); + + file_accessed(file); + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count++; + if (IS_DAX(bd_inode)) { + vma->vm_ops = &blkdev_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + } else { + vma->vm_ops = &blkdev_default_vm_ops; + } + mutex_unlock(&bd_inode->i_mutex); + + return 0; +} +#else +#define blkdev_mmap generic_file_mmap +#endif + const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .mmap = generic_file_mmap, + .mmap = blkdev_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT diff --git a/include/linux/fs.h b/include/linux/fs.h index 3aa5142..96fabc9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -482,6 +482,9 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; +#ifdef CONFIG_FS_DAX + int bd_map_count; +#endif }; /* @@ -2264,6 +2267,14 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); +#ifdef CONFIG_FS_DAX +extern bool blkdev_dax_capable(struct block_device *bdev); +#else +static inline bool blkdev_dax_capable(struct block_device *bdev) +{ + return false; +} +#endif extern struct super_block *blockdev_superblock; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f15d980..401c409 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -152,6 +152,8 @@ struct inodes_stat_t { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) +#define BLKDAXSET _IO(0x12,128) +#define BLKDAXGET _IO(0x12,129) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v0.10.2 From ac34f15e0c6d2fd58480052b6985f6991fb53bcc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 29 Dec 2015 14:02:29 -0800 Subject: block: fix del_gendisk() vs blkdev_ioctl crash When tearing down a block device early in its lifetime, userspace may still be performing discovery actions like blkdev_ioctl() to re-read partitions. The nvdimm_revalidate_disk() implementation depends on disk->driverfs_dev to be valid at entry. However, it is set to NULL in del_gendisk() and fatally this is happening *before* the disk device is deleted from userspace view. There's no reason for del_gendisk() to clear ->driverfs_dev. That device is the parent of the disk. It is guaranteed to not be freed until the disk, as a child, drops its ->parent reference. We could also fix this issue locally in nvdimm_revalidate_disk() by using disk_to_dev(disk)->parent, but lets fix it globally since ->driverfs_dev follows the lifetime of the parent. Longer term we should probably just add a @parent parameter to add_disk(), and stop carrying this pointer in the gendisk. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] nvdimm_revalidate_disk+0x18/0x90 [libnvdimm] CPU: 2 PID: 538 Comm: systemd-udevd Tainted: G O 4.4.0-rc5 #2257 [..] Call Trace: [] rescan_partitions+0x87/0x2c0 [] ? __lock_is_held+0x49/0x70 [] __blkdev_reread_part+0x72/0xb0 [] blkdev_reread_part+0x25/0x40 [] blkdev_ioctl+0x4fd/0x9c0 [] ? current_kernel_time64+0x69/0xd0 [] block_ioctl+0x3d/0x50 [] do_vfs_ioctl+0x308/0x560 [] ? __audit_syscall_entry+0xb1/0x100 [] ? do_audit_syscall_entry+0x66/0x70 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x12/0x76 Reported-by: Robert Hu Signed-off-by: Dan Williams diff --git a/block/genhd.c b/block/genhd.c index e5cafa5..1a971be 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -664,7 +664,6 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); - disk->driverfs_dev = NULL; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); -- cgit v0.10.2 From 9e0e252a048b0ba5066f0dc15c3b2468ffe5c422 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 24 Dec 2015 19:20:32 -0700 Subject: badblocks: Add core badblock management code Take the core badblocks implementation from md, and make it generally available. This follows the same style as kernel implementations of linked lists, rb-trees etc, where you can have a structure that can be embedded anywhere, and accessor functions to manipulate the data. The only changes in this copy of the code are ones to generalize function/variable names from md-specific ones. Also add init and free functions. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams diff --git a/block/Makefile b/block/Makefile index 00ecc97..db5f622 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ - partitions/ + badblocks.o partitions/ obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o diff --git a/block/badblocks.c b/block/badblocks.c new file mode 100644 index 0000000..96aeb91 --- /dev/null +++ b/block/badblocks.c @@ -0,0 +1,561 @@ +/* + * Bad block management + * + * - Heavily based on MD badblocks code from Neil Brown + * + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +/** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information + * @s: sector (start) at which to check for badblocks + * @sectors: number of sectors to check for badblocks + * @first_bad: pointer to store location of the first badblock + * @bad_sectors: pointer to store number of badblocks after @first_bad + * + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so badblocks_check + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * + * Return: + * 0: there are no known bad blocks in the range + * 1: there are known bad block which are all acknowledged + * -1: there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int hi; + int lo; + u64 *p = bb->page; + int rv; + sector_t target = s + sectors; + unsigned seq; + + if (bb->shift > 0) { + /* round the start down, and the end up */ + s >>= bb->shift; + target += (1<shift) - 1; + target >>= bb->shift; + sectors = target - s; + } + /* 'target' is now the first block after the bad range */ + +retry: + seq = read_seqbegin(&bb->lock); + lo = 0; + rv = 0; + hi = bb->count; + + /* Binary search between lo and hi for 'target' + * i.e. for the last range that starts before 'target' + */ + /* INVARIANT: ranges before 'lo' and at-or-after 'hi' + * are known not to be the last range before target. + * VARIANT: hi-lo is the number of possible + * ranges, and decreases until it reaches 1 + */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + /* This could still be the one, earlier ranges + * could not. + */ + lo = mid; + else + /* This and later ranges are definitely out. */ + hi = mid; + } + /* 'lo' might be the last that started before target, but 'hi' isn't */ + if (hi > lo) { + /* need to check all range that end after 's' to see if + * any are unacknowledged. + */ + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + if (BB_OFFSET(p[lo]) < target) { + /* starts before the end, and finishes after + * the start, so they must overlap + */ + if (rv != -1 && BB_ACK(p[lo])) + rv = 1; + else + rv = -1; + *first_bad = BB_OFFSET(p[lo]); + *bad_sectors = BB_LEN(p[lo]); + } + lo--; + } + } + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_check); + +/** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * @acknowledged: weather to mark the bad sectors as acknowledged + * + * This might extend the table, or might contract it if two adjacent ranges + * can be merged. We binary-search to find the 'insertion' point, then + * decide how best to handle it. + * + * Return: + * 0: success + * 1: failed to set badblocks (out of space) + */ +int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + u64 *p; + int lo, hi; + int rv = 0; + unsigned long flags; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 0; + + if (bb->shift) { + /* round the start down, and the end up */ + sector_t next = s + sectors; + + s >>= bb->shift; + next += (1<shift) - 1; + next >>= bb->shift; + sectors = next - s; + } + + write_seqlock_irqsave(&bb->lock, flags); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts at-or-before 's' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a <= s) + lo = mid; + else + hi = mid; + } + if (hi > lo && BB_OFFSET(p[lo]) > s) + hi = lo; + + if (hi > lo) { + /* we found a range that might merge with the start + * of our new range + */ + sector_t a = BB_OFFSET(p[lo]); + sector_t e = a + BB_LEN(p[lo]); + int ack = BB_ACK(p[lo]); + + if (e >= s) { + /* Yes, we can merge with a previous range */ + if (s == a && s + sectors >= e) + /* new range covers old */ + ack = acknowledged; + else + ack = ack && acknowledged; + + if (e < s + sectors) + e = s + sectors; + if (e - a <= BB_MAX_LEN) { + p[lo] = BB_MAKE(a, e-a, ack); + s = e; + } else { + /* does not all fit in one range, + * make p[lo] maximal + */ + if (BB_LEN(p[lo]) != BB_MAX_LEN) + p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + } + } + if (sectors && hi < bb->count) { + /* 'hi' points to the first range that starts after 's'. + * Maybe we can merge with the start of that range + */ + sector_t a = BB_OFFSET(p[hi]); + sector_t e = a + BB_LEN(p[hi]); + int ack = BB_ACK(p[hi]); + + if (a <= s + sectors) { + /* merging is possible */ + if (e <= s + sectors) { + /* full overlap */ + e = s + sectors; + ack = acknowledged; + } else + ack = ack && acknowledged; + + a = s; + if (e - a <= BB_MAX_LEN) { + p[hi] = BB_MAKE(a, e-a, ack); + s = e; + } else { + p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + lo = hi; + hi++; + } + } + if (sectors == 0 && hi < bb->count) { + /* we might be able to combine lo and hi */ + /* Note: 's' is at the end of 'lo' */ + sector_t a = BB_OFFSET(p[hi]); + int lolen = BB_LEN(p[lo]); + int hilen = BB_LEN(p[hi]); + int newlen = lolen + hilen - (s - a); + + if (s >= a && newlen < BB_MAX_LEN) { + /* yes, we can combine them */ + int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); + + p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); + memmove(p + hi, p + hi + 1, + (bb->count - hi - 1) * 8); + bb->count--; + } + } + while (sectors) { + /* didn't merge (it all). + * Need to add a range just before 'hi' + */ + if (bb->count >= MAX_BADBLOCKS) { + /* No room for more */ + rv = 1; + break; + } else { + int this_sectors = sectors; + + memmove(p + hi + 1, p + hi, + (bb->count - hi) * 8); + bb->count++; + + if (this_sectors > BB_MAX_LEN) + this_sectors = BB_MAX_LEN; + p[hi] = BB_MAKE(s, this_sectors, acknowledged); + sectors -= this_sectors; + s += this_sectors; + } + } + + bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; + write_sequnlock_irqrestore(&bb->lock, flags); + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_set); + +/** + * badblocks_clear() - Remove a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * + * This may involve extending the table if we spilt a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + * + * Return: + * 0: success + * 1: failed to clear badblocks + */ +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + u64 *p; + int lo, hi; + sector_t target = s + sectors; + int rv = 0; + + if (bb->shift > 0) { + /* When clearing we round the start up and the end down. + * This should not matter as the shift should align with + * the block size and no rounding should ever be needed. + * However it is better the think a block is bad when it + * isn't than to think a block is not bad when it is. + */ + s += (1<shift) - 1; + s >>= bb->shift; + target >>= bb->shift; + sectors = target - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts before 'target' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + lo = mid; + else + hi = mid; + } + if (hi > lo) { + /* p[lo] is the last range that could overlap the + * current range. Earlier ranges could also overlap, + * but only this one can overlap the end of the range. + */ + if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + /* Partial overlap, leave the tail of this range */ + int ack = BB_ACK(p[lo]); + sector_t a = BB_OFFSET(p[lo]); + sector_t end = a + BB_LEN(p[lo]); + + if (a < s) { + /* we need to split this range */ + if (bb->count >= MAX_BADBLOCKS) { + rv = -ENOSPC; + goto out; + } + memmove(p+lo+1, p+lo, (bb->count - lo) * 8); + bb->count++; + p[lo] = BB_MAKE(a, s-a, ack); + lo++; + } + p[lo] = BB_MAKE(target, end - target, ack); + /* there is no longer an overlap */ + hi = lo; + lo--; + } + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + /* This range does overlap */ + if (BB_OFFSET(p[lo]) < s) { + /* Keep the early parts of this range. */ + int ack = BB_ACK(p[lo]); + sector_t start = BB_OFFSET(p[lo]); + + p[lo] = BB_MAKE(start, s - start, ack); + /* now low doesn't overlap, so.. */ + break; + } + lo--; + } + /* 'lo' is strictly before, 'hi' is strictly after, + * anything between needs to be discarded + */ + if (hi - lo > 1) { + memmove(p+lo+1, p+hi, (bb->count - hi) * 8); + bb->count -= (hi - lo - 1); + } + } + + bb->changed = 1; +out: + write_sequnlock_irq(&bb->lock); + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_clear); + +/** + * ack_all_badblocks() - Acknowledge all bad blocks in a list. + * @bb: the badblocks structure that holds all badblock information + * + * This only succeeds if ->changed is clear. It is used by + * in-kernel metadata updates + */ +void ack_all_badblocks(struct badblocks *bb) +{ + if (bb->page == NULL || bb->changed) + /* no point even trying */ + return; + write_seqlock_irq(&bb->lock); + + if (bb->changed == 0 && bb->unacked_exist) { + u64 *p = bb->page; + int i; + + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + sector_t start = BB_OFFSET(p[i]); + int len = BB_LEN(p[i]); + + p[i] = BB_MAKE(start, len, 1); + } + } + bb->unacked_exist = 0; + } + write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(ack_all_badblocks); + +/** + * badblocks_show() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @unack: weather to show unacknowledged badblocks + * + * Return: + * Length of returned data + */ +ssize_t badblocks_show(struct badblocks *bb, char *page, int unack) +{ + size_t len; + int i; + u64 *p = bb->page; + unsigned seq; + + if (bb->shift < 0) + return 0; + +retry: + seq = read_seqbegin(&bb->lock); + + len = 0; + i = 0; + + while (len < PAGE_SIZE && i < bb->count) { + sector_t s = BB_OFFSET(p[i]); + unsigned int length = BB_LEN(p[i]); + int ack = BB_ACK(p[i]); + + i++; + + if (unack && ack) + continue; + + len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", + (unsigned long long)s << bb->shift, + length << bb->shift); + } + if (unack && len == 0) + bb->unacked_exist = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return len; +} +EXPORT_SYMBOL_GPL(badblocks_show); + +/** + * badblocks_store() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @len: length of data received from sysfs + * @unack: weather to show unacknowledged badblocks + * + * Return: + * Length of the buffer processed or -ve error. + */ +ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, + int unack) +{ + unsigned long long sector; + int length; + char newline; + + switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { + case 3: + if (newline != '\n') + return -EINVAL; + case 2: + if (length <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (badblocks_set(bb, sector, length, !unack)) + return -ENOSPC; + else + return len; +} +EXPORT_SYMBOL_GPL(badblocks_store); + +/** + * badblocks_init() - initialize the badblocks structure + * @bb: the badblocks structure that holds all badblock information + * @enable: weather to enable badblocks accounting + * + * Return: + * 0: success + * -ve errno: on error + */ +int badblocks_init(struct badblocks *bb, int enable) +{ + bb->count = 0; + if (enable) + bb->shift = 0; + else + bb->shift = -1; + bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (bb->page == (u64 *)0) { + bb->shift = -1; + return -ENOMEM; + } + seqlock_init(&bb->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(badblocks_init); + +/** + * badblocks_free() - free the badblocks structure + * @bb: the badblocks structure that holds all badblock information + */ +void badblocks_free(struct badblocks *bb) +{ + kfree(bb->page); + bb->page = NULL; +} +EXPORT_SYMBOL_GPL(badblocks_free); diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h new file mode 100644 index 0000000..9293446 --- /dev/null +++ b/include/linux/badblocks.h @@ -0,0 +1,53 @@ +#ifndef _LINUX_BADBLOCKS_H +#define _LINUX_BADBLOCKS_H + +#include +#include +#include +#include + +#define BB_LEN_MASK (0x00000000000001FFULL) +#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) +#define BB_ACK_MASK (0x8000000000000000ULL) +#define BB_MAX_LEN 512 +#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) +#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) +#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) +#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) + +/* Bad block numbers are stored sorted in a single page. + * 64bits is used for each block or extent. + * 54 bits are sector number, 9 bits are extent size, + * 1 bit is an 'acknowledged' flag. + */ +#define MAX_BADBLOCKS (PAGE_SIZE/8) + +struct badblocks { + int count; /* count of bad blocks */ + int unacked_exist; /* there probably are unacknowledged + * bad blocks. This is only cleared + * when a read discovers none + */ + int shift; /* shift from sectors to block size + * a -ve shift means badblocks are + * disabled.*/ + u64 *page; /* badblock list */ + int changed; + seqlock_t lock; + sector_t sector; + sector_t size; /* in sectors */ +}; + +int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); +int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged); +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors); +void ack_all_badblocks(struct badblocks *bb); +ssize_t badblocks_show(struct badblocks *bb, char *page, int unack); +ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, + int unack); +int badblocks_init(struct badblocks *bb, int enable); +void badblocks_free(struct badblocks *bb); + +#endif -- cgit v0.10.2 From 99e6608c9e7414ae4f2168df8bf8fae3eb49e41f Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Sat, 9 Jan 2016 08:36:51 -0800 Subject: block: Add badblock management for gendisks NVDIMM devices, which can behave more like DRAM rather than block devices, may develop bad cache lines, or 'poison'. A block device exposed by the pmem driver can then consume poison via a read (or write), and cause a machine check. On platforms without machine check recovery features, this would mean a crash. The block device maintaining a runtime list of all known sectors that have poison can directly avoid this, and also provide a path forward to enable proper handling/recovery for DAX faults on such a device. Use the new badblock management interfaces to add a badblocks list to gendisks. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams diff --git a/block/genhd.c b/block/genhd.c index 1a971be..88579cf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "blk.h" @@ -505,6 +506,16 @@ static int exact_lock(dev_t devt, void *data) return 0; } +int disk_alloc_badblocks(struct gendisk *disk) +{ + disk->bb = kzalloc(sizeof(*(disk->bb)), GFP_KERNEL); + if (!disk->bb) + return -ENOMEM; + + return badblocks_init(disk->bb, 1); +} +EXPORT_SYMBOL(disk_alloc_badblocks); + static void register_disk(struct gendisk *disk) { struct device *ddev = disk_to_dev(disk); @@ -659,6 +670,11 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); + if (disk->bb) { + badblocks_free(disk->bb); + kfree(disk->bb); + } + part_stat_set_all(&disk->part0, 0); disk->part0.stamp = 0; @@ -671,6 +687,63 @@ void del_gendisk(struct gendisk *disk) } EXPORT_SYMBOL(del_gendisk); +/* + * The gendisk usage of badblocks does not track acknowledgements for + * badblocks. We always assume they are acknowledged. + */ +int disk_check_badblocks(struct gendisk *disk, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + if (!disk->bb) + return 0; + + return badblocks_check(disk->bb, s, sectors, first_bad, bad_sectors); +} +EXPORT_SYMBOL(disk_check_badblocks); + +int disk_set_badblocks(struct gendisk *disk, sector_t s, int sectors) +{ + if (!disk->bb) + return 0; + + return badblocks_set(disk->bb, s, sectors, 1); +} +EXPORT_SYMBOL(disk_set_badblocks); + +int disk_clear_badblocks(struct gendisk *disk, sector_t s, int sectors) +{ + if (!disk->bb) + return 0; + + return badblocks_clear(disk->bb, s, sectors); +} +EXPORT_SYMBOL(disk_clear_badblocks); + +/* sysfs access to bad-blocks list. */ +static ssize_t disk_badblocks_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return sprintf(page, "\n"); + + return badblocks_show(disk->bb, page, 0); +} + +static ssize_t disk_badblocks_store(struct device *dev, + struct device_attribute *attr, + const char *page, size_t len) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return -ENXIO; + + return badblocks_store(disk->bb, page, len, 0); +} + /** * get_gendisk - get partitioning information for a given device * @devt: device to get partitioning information for @@ -989,6 +1062,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show, + disk_badblocks_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -1010,6 +1085,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_badblocks.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 847cc1d..0bbec68 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -162,6 +162,7 @@ struct disk_part_tbl { }; struct disk_events; +struct badblocks; #if defined(CONFIG_BLK_DEV_INTEGRITY) @@ -213,6 +214,7 @@ struct gendisk { struct kobject integrity_kobj; #endif /* CONFIG_BLK_DEV_INTEGRITY */ int node_id; + struct badblocks *bb; }; static inline struct gendisk *part_to_disk(struct hd_struct *part) @@ -433,6 +435,11 @@ extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); +int disk_alloc_badblocks(struct gendisk *disk); +extern int disk_check_badblocks(struct gendisk *disk, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); +extern int disk_set_badblocks(struct gendisk *disk, sector_t s, int sectors); +extern int disk_clear_badblocks(struct gendisk *disk, sector_t s, int sectors); extern void set_device_ro(struct block_device *bdev, int flag); extern void set_disk_ro(struct gendisk *disk, int flag); -- cgit v0.10.2 From fc974ee2bffdde47d1e4b220cf326952cc2c4794 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 24 Dec 2015 19:20:34 -0700 Subject: md: convert to use the generic badblocks code Retain badblocks as part of rdev, but use the accessor functions from include/linux/badblocks for all manipulation. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams diff --git a/drivers/md/md.c b/drivers/md/md.c index 807095f..1e48aa9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -709,8 +710,7 @@ void md_rdev_clear(struct md_rdev *rdev) put_page(rdev->bb_page); rdev->bb_page = NULL; } - kfree(rdev->badblocks.page); - rdev->badblocks.page = NULL; + badblocks_free(&rdev->badblocks); } EXPORT_SYMBOL_GPL(md_rdev_clear); @@ -1360,8 +1360,6 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) return cpu_to_le32(csum); } -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged); static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { struct mdp_superblock_1 *sb; @@ -1486,8 +1484,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (md_set_badblocks(&rdev->badblocks, - sector, count, 1) == 0) + if (badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) @@ -2319,7 +2316,7 @@ repeat: rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { rdev->badblocks.changed = 0; - md_ack_all_badblocks(&rdev->badblocks); + ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); } clear_bit(Blocked, &rdev->flags); @@ -2445,7 +2442,7 @@ repeat: clear_bit(Blocked, &rdev->flags); if (any_badblocks_changed) - md_ack_all_badblocks(&rdev->badblocks); + ack_all_badblocks(&rdev->badblocks); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } @@ -3046,11 +3043,17 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_ static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack); -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); - +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + * are recorded as bad. The list is truncated to fit within + * the one-page limit of sysfs. + * Writing "sector length" to this file adds an acknowledged + * bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + * been acknowledged. Writing to this file adds bad blocks + * without acknowledging them. This is largely for testing. + */ static ssize_t bb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 0); @@ -3165,14 +3168,7 @@ int md_rdev_init(struct md_rdev *rdev) * This reserves the space even on arrays where it cannot * be used - I wonder if that matters */ - rdev->badblocks.count = 0; - rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ - rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); - seqlock_init(&rdev->badblocks.lock); - if (rdev->badblocks.page == NULL) - return -ENOMEM; - - return 0; + return badblocks_init(&rdev->badblocks, 0); } EXPORT_SYMBOL_GPL(md_rdev_init); /* @@ -8478,254 +8474,9 @@ void md_finish_reshape(struct mddev *mddev) } EXPORT_SYMBOL(md_finish_reshape); -/* Bad block management. - * We can record which blocks on each device are 'bad' and so just - * fail those blocks, or that stripe, rather than the whole device. - * Entries in the bad-block table are 64bits wide. This comprises: - * Length of bad-range, in sectors: 0-511 for lengths 1-512 - * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) - * A 'shift' can be set so that larger blocks are tracked and - * consequently larger devices can be covered. - * 'Acknowledged' flag - 1 bit. - the most significant bit. - * - * Locking of the bad-block table uses a seqlock so md_is_badblock - * might need to retry if it is very unlucky. - * We will sometimes want to check for bad blocks in a bi_end_io function, - * so we use the write_seqlock_irq variant. - * - * When looking for a bad block we specify a range and want to - * know if any block in the range is bad. So we binary-search - * to the last range that starts at-or-before the given endpoint, - * (or "before the sector after the target range") - * then see if it ends after the given start. - * We return - * 0 if there are no known bad blocks in the range - * 1 if there are known bad block which are all acknowledged - * -1 if there are bad blocks which have not yet been acknowledged in metadata. - * plus the start/length of the first bad section we overlap. - */ -int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) -{ - int hi; - int lo; - u64 *p = bb->page; - int rv; - sector_t target = s + sectors; - unsigned seq; - - if (bb->shift > 0) { - /* round the start down, and the end up */ - s >>= bb->shift; - target += (1<shift) - 1; - target >>= bb->shift; - sectors = target - s; - } - /* 'target' is now the first block after the bad range */ - -retry: - seq = read_seqbegin(&bb->lock); - lo = 0; - rv = 0; - hi = bb->count; - - /* Binary search between lo and hi for 'target' - * i.e. for the last range that starts before 'target' - */ - /* INVARIANT: ranges before 'lo' and at-or-after 'hi' - * are known not to be the last range before target. - * VARIANT: hi-lo is the number of possible - * ranges, and decreases until it reaches 1 - */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - /* This could still be the one, earlier ranges - * could not. */ - lo = mid; - else - /* This and later ranges are definitely out. */ - hi = mid; - } - /* 'lo' might be the last that started before target, but 'hi' isn't */ - if (hi > lo) { - /* need to check all range that end after 's' to see if - * any are unacknowledged. - */ - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - if (BB_OFFSET(p[lo]) < target) { - /* starts before the end, and finishes after - * the start, so they must overlap - */ - if (rv != -1 && BB_ACK(p[lo])) - rv = 1; - else - rv = -1; - *first_bad = BB_OFFSET(p[lo]); - *bad_sectors = BB_LEN(p[lo]); - } - lo--; - } - } - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return rv; -} -EXPORT_SYMBOL_GPL(md_is_badblock); - -/* - * Add a range of bad blocks to the table. - * This might extend the table, or might contract it - * if two adjacent ranges can be merged. - * We binary-search to find the 'insertion' point, then - * decide how best to handle it. - */ -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) -{ - u64 *p; - int lo, hi; - int rv = 1; - unsigned long flags; - - if (bb->shift < 0) - /* badblocks are disabled */ - return 0; - - if (bb->shift) { - /* round the start down, and the end up */ - sector_t next = s + sectors; - s >>= bb->shift; - next += (1<shift) - 1; - next >>= bb->shift; - sectors = next - s; - } - - write_seqlock_irqsave(&bb->lock, flags); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts at-or-before 's' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a <= s) - lo = mid; - else - hi = mid; - } - if (hi > lo && BB_OFFSET(p[lo]) > s) - hi = lo; - - if (hi > lo) { - /* we found a range that might merge with the start - * of our new range - */ - sector_t a = BB_OFFSET(p[lo]); - sector_t e = a + BB_LEN(p[lo]); - int ack = BB_ACK(p[lo]); - if (e >= s) { - /* Yes, we can merge with a previous range */ - if (s == a && s + sectors >= e) - /* new range covers old */ - ack = acknowledged; - else - ack = ack && acknowledged; - - if (e < s + sectors) - e = s + sectors; - if (e - a <= BB_MAX_LEN) { - p[lo] = BB_MAKE(a, e-a, ack); - s = e; - } else { - /* does not all fit in one range, - * make p[lo] maximal - */ - if (BB_LEN(p[lo]) != BB_MAX_LEN) - p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - } - } - if (sectors && hi < bb->count) { - /* 'hi' points to the first range that starts after 's'. - * Maybe we can merge with the start of that range */ - sector_t a = BB_OFFSET(p[hi]); - sector_t e = a + BB_LEN(p[hi]); - int ack = BB_ACK(p[hi]); - if (a <= s + sectors) { - /* merging is possible */ - if (e <= s + sectors) { - /* full overlap */ - e = s + sectors; - ack = acknowledged; - } else - ack = ack && acknowledged; - - a = s; - if (e - a <= BB_MAX_LEN) { - p[hi] = BB_MAKE(a, e-a, ack); - s = e; - } else { - p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - lo = hi; - hi++; - } - } - if (sectors == 0 && hi < bb->count) { - /* we might be able to combine lo and hi */ - /* Note: 's' is at the end of 'lo' */ - sector_t a = BB_OFFSET(p[hi]); - int lolen = BB_LEN(p[lo]); - int hilen = BB_LEN(p[hi]); - int newlen = lolen + hilen - (s - a); - if (s >= a && newlen < BB_MAX_LEN) { - /* yes, we can combine them */ - int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); - p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); - memmove(p + hi, p + hi + 1, - (bb->count - hi - 1) * 8); - bb->count--; - } - } - while (sectors) { - /* didn't merge (it all). - * Need to add a range just before 'hi' */ - if (bb->count >= MD_MAX_BADBLOCKS) { - /* No room for more */ - rv = 0; - break; - } else { - int this_sectors = sectors; - memmove(p + hi + 1, p + hi, - (bb->count - hi) * 8); - bb->count++; - - if (this_sectors > BB_MAX_LEN) - this_sectors = BB_MAX_LEN; - p[hi] = BB_MAKE(s, this_sectors, acknowledged); - sectors -= this_sectors; - s += this_sectors; - } - } - - bb->changed = 1; - if (!acknowledged) - bb->unacked_exist = 1; - write_sequnlock_irqrestore(&bb->lock, flags); - - return rv; -} +/* Bad block management */ +/* Returns 1 on success, 0 on failure */ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { @@ -8734,114 +8485,19 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->new_data_offset; else s += rdev->data_offset; - rv = md_set_badblocks(&rdev->badblocks, - s, sectors, 0); - if (rv) { + rv = badblocks_set(&rdev->badblocks, s, sectors, 0); + if (rv == 0) { /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); md_wakeup_thread(rdev->mddev->thread); - } - return rv; + return 1; + } else + return 0; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); -/* - * Remove a range of bad blocks from the table. - * This may involve extending the table if we spilt a region, - * but it must not fail. So if the table becomes full, we just - * drop the remove request. - */ -static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) -{ - u64 *p; - int lo, hi; - sector_t target = s + sectors; - int rv = 0; - - if (bb->shift > 0) { - /* When clearing we round the start up and the end down. - * This should not matter as the shift should align with - * the block size and no rounding should ever be needed. - * However it is better the think a block is bad when it - * isn't than to think a block is not bad when it is. - */ - s += (1<shift) - 1; - s >>= bb->shift; - target >>= bb->shift; - sectors = target - s; - } - - write_seqlock_irq(&bb->lock); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts before 'target' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - lo = mid; - else - hi = mid; - } - if (hi > lo) { - /* p[lo] is the last range that could overlap the - * current range. Earlier ranges could also overlap, - * but only this one can overlap the end of the range. - */ - if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { - /* Partial overlap, leave the tail of this range */ - int ack = BB_ACK(p[lo]); - sector_t a = BB_OFFSET(p[lo]); - sector_t end = a + BB_LEN(p[lo]); - - if (a < s) { - /* we need to split this range */ - if (bb->count >= MD_MAX_BADBLOCKS) { - rv = -ENOSPC; - goto out; - } - memmove(p+lo+1, p+lo, (bb->count - lo) * 8); - bb->count++; - p[lo] = BB_MAKE(a, s-a, ack); - lo++; - } - p[lo] = BB_MAKE(target, end - target, ack); - /* there is no longer an overlap */ - hi = lo; - lo--; - } - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - /* This range does overlap */ - if (BB_OFFSET(p[lo]) < s) { - /* Keep the early parts of this range. */ - int ack = BB_ACK(p[lo]); - sector_t start = BB_OFFSET(p[lo]); - p[lo] = BB_MAKE(start, s - start, ack); - /* now low doesn't overlap, so.. */ - break; - } - lo--; - } - /* 'lo' is strictly before, 'hi' is strictly after, - * anything between needs to be discarded - */ - if (hi - lo > 1) { - memmove(p+lo+1, p+hi, (bb->count - hi) * 8); - bb->count -= (hi - lo - 1); - } - } - - bb->changed = 1; -out: - write_sequnlock_irq(&bb->lock); - return rv; -} - int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { @@ -8849,133 +8505,11 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->new_data_offset; else s += rdev->data_offset; - return md_clear_badblocks(&rdev->badblocks, + return badblocks_clear(&rdev->badblocks, s, sectors); } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); -/* - * Acknowledge all bad blocks in a list. - * This only succeeds if ->changed is clear. It is used by - * in-kernel metadata updates - */ -void md_ack_all_badblocks(struct badblocks *bb) -{ - if (bb->page == NULL || bb->changed) - /* no point even trying */ - return; - write_seqlock_irq(&bb->lock); - - if (bb->changed == 0 && bb->unacked_exist) { - u64 *p = bb->page; - int i; - for (i = 0; i < bb->count ; i++) { - if (!BB_ACK(p[i])) { - sector_t start = BB_OFFSET(p[i]); - int len = BB_LEN(p[i]); - p[i] = BB_MAKE(start, len, 1); - } - } - bb->unacked_exist = 0; - } - write_sequnlock_irq(&bb->lock); -} -EXPORT_SYMBOL_GPL(md_ack_all_badblocks); - -/* sysfs access to bad-blocks list. - * We present two files. - * 'bad-blocks' lists sector numbers and lengths of ranges that - * are recorded as bad. The list is truncated to fit within - * the one-page limit of sysfs. - * Writing "sector length" to this file adds an acknowledged - * bad block list. - * 'unacknowledged-bad-blocks' lists bad blocks that have not yet - * been acknowledged. Writing to this file adds bad blocks - * without acknowledging them. This is largely for testing. - */ - -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack) -{ - size_t len; - int i; - u64 *p = bb->page; - unsigned seq; - - if (bb->shift < 0) - return 0; - -retry: - seq = read_seqbegin(&bb->lock); - - len = 0; - i = 0; - - while (len < PAGE_SIZE && i < bb->count) { - sector_t s = BB_OFFSET(p[i]); - unsigned int length = BB_LEN(p[i]); - int ack = BB_ACK(p[i]); - i++; - - if (unack && ack) - continue; - - len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", - (unsigned long long)s << bb->shift, - length << bb->shift); - } - if (unack && len == 0) - bb->unacked_exist = 0; - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return len; -} - -#define DO_DEBUG 1 - -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) -{ - unsigned long long sector; - int length; - char newline; -#ifdef DO_DEBUG - /* Allow clearing via sysfs *only* for testing/debugging. - * Normally only a successful write may clear a badblock - */ - int clear = 0; - if (page[0] == '-') { - clear = 1; - page++; - } -#endif /* DO_DEBUG */ - - switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { - case 3: - if (newline != '\n') - return -EINVAL; - case 2: - if (length <= 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - -#ifdef DO_DEBUG - if (clear) { - md_clear_badblocks(bb, sector, length); - return len; - } -#endif /* DO_DEBUG */ - if (md_set_badblocks(bb, sector, length, !unack)) - return len; - else - return -ENOSPC; -} - static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 2bea51e..389afc4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -28,13 +29,6 @@ #define MaxSector (~(sector_t)0) -/* Bad block numbers are stored sorted in a single page. - * 64bits is used for each block or extent. - * 54 bits are sector number, 9 bits are extent size, - * 1 bit is an 'acknowledged' flag. - */ -#define MD_MAX_BADBLOCKS (PAGE_SIZE/8) - /* * MD's 'extended' device */ @@ -117,22 +111,7 @@ struct md_rdev { struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ - struct badblocks { - int count; /* count of bad blocks */ - int unacked_exist; /* there probably are unacknowledged - * bad blocks. This is only cleared - * when a read discovers none - */ - int shift; /* shift from sectors to block size - * a -ve shift means badblocks are - * disabled.*/ - u64 *page; /* badblock list */ - int changed; - seqlock_t lock; - - sector_t sector; - sector_t size; /* in sectors */ - } badblocks; + struct badblocks badblocks; }; enum flag_bits { Faulty, /* device is known to have a fault */ @@ -185,22 +164,11 @@ enum flag_bits { */ }; -#define BB_LEN_MASK (0x00000000000001FFULL) -#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) -#define BB_ACK_MASK (0x8000000000000000ULL) -#define BB_MAX_LEN 512 -#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) -#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) -#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) -#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) - -extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors); static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors) { if (unlikely(rdev->badblocks.count)) { - int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, + int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, sectors, first_bad, bad_sectors); if (rv) @@ -213,8 +181,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); -extern void md_ack_all_badblocks(struct badblocks *bb); - struct md_cluster_info; struct mddev { -- cgit v0.10.2 From d26f73f083ed6fbea7fd3fdbacb527b7f3e75ac0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 30 Dec 2015 15:01:19 -0800 Subject: nfit_test: Enable DSMs for all test NFITs In preparation for getting a poison list using ARS DSMs, enable DSMs for all manufactured NFITs supplied by the test framework. Also, supply valid response data for ars_status. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 51cf825..90bd2ea 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -248,6 +248,8 @@ static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd, nd_cmd->out_length = 256; nd_cmd->num_records = 0; + nd_cmd->address = 0; + nd_cmd->length = -1ULL; nd_cmd->status = 0; return 0; @@ -1088,6 +1090,8 @@ static void nfit_test1_setup(struct nfit_test *t) struct acpi_nfit_memory_map *memdev; struct acpi_nfit_control_region *dcr; struct acpi_nfit_system_address *spa; + struct nvdimm_bus_descriptor *nd_desc; + struct acpi_nfit_desc *acpi_desc; offset = 0; /* spa0 (flat range with no bdw aliasing) */ @@ -1135,6 +1139,13 @@ static void nfit_test1_setup(struct nfit_test *t) dcr->command_size = 0; dcr->status_offset = 0; dcr->status_size = 0; + + acpi_desc = &t->acpi_desc; + set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en); + set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en); + set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en); + nd_desc = &acpi_desc->nd_desc; + nd_desc->ndctl = nfit_test_ctl; } static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa, -- cgit v0.10.2 From 0caeef63e6d2f866d85bb507bf63e0ce8ec91cef Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 24 Dec 2015 19:21:43 -0700 Subject: libnvdimm: Add a poison list and export badblocks During region creation, perform Address Range Scrubs (ARS) for the SPA (System Physical Address) ranges to retrieve known poison locations from firmware. Add a new data structure 'nd_poison' which is used as a list in nvdimm_bus to store these poison locations. When creating a pmem namespace, if there is any known poison associated with its physical address space, convert the poison ranges to bad sectors that are exposed using the badblocks interface. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index e7ed39b..e1dbc8d 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1473,6 +1474,201 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus, /* devm will free nfit_blk */ } +static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc, + struct nd_cmd_ars_cap *cmd, u64 addr, u64 length) +{ + cmd->address = addr; + cmd->length = length; + + return nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd, + sizeof(*cmd)); +} + +static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc, + struct nd_cmd_ars_start *cmd, u64 addr, u64 length) +{ + int rc; + + cmd->address = addr; + cmd->length = length; + cmd->type = ND_ARS_PERSISTENT; + + while (1) { + rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd, + sizeof(*cmd)); + if (rc) + return rc; + switch (cmd->status) { + case 0: + return 0; + case 1: + /* ARS unsupported, but we should never get here */ + return 0; + case 2: + return -EINVAL; + case 3: + /* ARS is in progress */ + msleep(1000); + break; + default: + return -ENXIO; + } + } +} + +static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc, + struct nd_cmd_ars_status *cmd) +{ + int rc; + + while (1) { + rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd, + sizeof(*cmd)); + if (rc || cmd->status & 0xffff) + return -ENXIO; + + /* Check extended status (Upper two bytes) */ + switch (cmd->status >> 16) { + case 0: + return 0; + case 1: + /* ARS is in progress */ + msleep(1000); + break; + case 2: + /* No ARS performed for the current boot */ + return 0; + default: + return -ENXIO; + } + } +} + +static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus, + struct nd_cmd_ars_status *ars_status, u64 start) +{ + int rc; + u32 i; + + /* + * The address field returned by ars_status should be either + * less than or equal to the address we last started ARS for. + * The (start, length) returned by ars_status should also have + * non-zero overlap with the range we started ARS for. + * If this is not the case, bail. + */ + if (ars_status->address > start || + (ars_status->address + ars_status->length < start)) + return -ENXIO; + + for (i = 0; i < ars_status->num_records; i++) { + rc = nvdimm_bus_add_poison(nvdimm_bus, + ars_status->records[i].err_address, + ars_status->records[i].length); + if (rc) + return rc; + } + + return 0; +} + +static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc, + struct nd_region_desc *ndr_desc) +{ + struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; + struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus; + struct nd_cmd_ars_status *ars_status = NULL; + struct nd_cmd_ars_start *ars_start = NULL; + struct nd_cmd_ars_cap *ars_cap = NULL; + u64 start, len, cur, remaining; + int rc; + + ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL); + if (!ars_cap) + return -ENOMEM; + + start = ndr_desc->res->start; + len = ndr_desc->res->end - ndr_desc->res->start + 1; + + rc = ars_get_cap(nd_desc, ars_cap, start, len); + if (rc) + goto out; + + /* + * If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in + * extended status is not set, skip this but continue initialization + */ + if ((ars_cap->status & 0xffff) || + !(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) { + dev_warn(acpi_desc->dev, + "ARS unsupported (status: 0x%x), won't create an error list\n", + ars_cap->status); + goto out; + } + + /* + * Check if a full-range ARS has been run. If so, use those results + * without having to start a new ARS. + */ + ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status), + GFP_KERNEL); + if (!ars_status) { + rc = -ENOMEM; + goto out; + } + + rc = ars_get_status(nd_desc, ars_status); + if (rc) + goto out; + + if (ars_status->address <= start && + (ars_status->address + ars_status->length >= start + len)) { + rc = ars_status_process_records(nvdimm_bus, ars_status, start); + goto out; + } + + /* + * ARS_STATUS can overflow if the number of poison entries found is + * greater than the maximum buffer size (ars_cap->max_ars_out) + * To detect overflow, check if the length field of ars_status + * is less than the length we supplied. If so, process the + * error entries we got, adjust the start point, and start again + */ + ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL); + if (!ars_start) + return -ENOMEM; + + cur = start; + remaining = len; + do { + u64 done, end; + + rc = ars_do_start(nd_desc, ars_start, cur, remaining); + if (rc) + goto out; + + rc = ars_get_status(nd_desc, ars_status); + if (rc) + goto out; + + rc = ars_status_process_records(nvdimm_bus, ars_status, cur); + if (rc) + goto out; + + end = min(cur + remaining, + ars_status->address + ars_status->length); + done = end - cur; + cur += done; + remaining -= done; + } while (remaining); + + out: + kfree(ars_cap); + kfree(ars_start); + kfree(ars_status); + return rc; +} + static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, struct acpi_nfit_memory_map *memdev, @@ -1585,6 +1781,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, nvdimm_bus = acpi_desc->nvdimm_bus; if (nfit_spa_type(spa) == NFIT_SPA_PM) { + rc = acpi_nfit_find_poison(acpi_desc, ndr_desc); + if (rc) { + dev_err(acpi_desc->dev, + "error while performing ARS to find poison: %d\n", + rc); + return rc; + } if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc)) return -ENOMEM; } else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) { diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 82c49bb..21003b7 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -325,6 +325,7 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, if (!nvdimm_bus) return NULL; INIT_LIST_HEAD(&nvdimm_bus->list); + INIT_LIST_HEAD(&nvdimm_bus->poison_list); init_waitqueue_head(&nvdimm_bus->probe_wait); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); mutex_init(&nvdimm_bus->reconfig_mutex); @@ -359,6 +360,191 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, } EXPORT_SYMBOL_GPL(__nvdimm_bus_register); +/** + * __add_badblock_range() - Convert a physical address range to bad sectors + * @disk: the disk associated with the namespace + * @ns_offset: namespace offset where the error range begins (in bytes) + * @len: number of bytes of poison to be added + * + * This assumes that the range provided with (ns_offset, len) is within + * the bounds of physical addresses for this namespace, i.e. lies in the + * interval [ns_start, ns_start + ns_size) + */ +static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len) +{ + unsigned int sector_size = queue_logical_block_size(disk->queue); + sector_t start_sector; + u64 num_sectors; + u32 rem; + int rc; + + start_sector = div_u64(ns_offset, sector_size); + num_sectors = div_u64_rem(len, sector_size, &rem); + if (rem) + num_sectors++; + + if (!disk->bb) { + rc = disk_alloc_badblocks(disk); + if (rc) + return rc; + } + + if (unlikely(num_sectors > (u64)INT_MAX)) { + u64 remaining = num_sectors; + sector_t s = start_sector; + + while (remaining) { + int done = min_t(u64, remaining, INT_MAX); + + rc = disk_set_badblocks(disk, s, done); + if (rc) + return rc; + remaining -= done; + s += done; + } + return 0; + } else + return disk_set_badblocks(disk, start_sector, num_sectors); +} + +/** + * nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks + * @disk: the gendisk associated with the namespace where badblocks + * will be stored + * @offset: offset at the start of the namespace before 'sector 0' + * @ndns: the namespace containing poison ranges + * + * The poison list generated during NFIT initialization may contain multiple, + * possibly overlapping ranges in the SPA (System Physical Address) space. + * Compare each of these ranges to the namespace currently being initialized, + * and add badblocks to the gendisk for all matching sub-ranges + * + * Return: + * 0 - Success + */ +int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, + struct nd_namespace_common *ndns) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + struct nvdimm_bus *nvdimm_bus; + struct list_head *poison_list; + u64 ns_start, ns_end, ns_size; + struct nd_poison *pl; + int rc; + + ns_size = nvdimm_namespace_capacity(ndns) - offset; + ns_start = nsio->res.start + offset; + ns_end = nsio->res.end; + + nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent); + poison_list = &nvdimm_bus->poison_list; + if (list_empty(poison_list)) + return 0; + + list_for_each_entry(pl, poison_list, list) { + u64 pl_end = pl->start + pl->length - 1; + + /* Discard intervals with no intersection */ + if (pl_end < ns_start) + continue; + if (pl->start > ns_end) + continue; + /* Deal with any overlap after start of the namespace */ + if (pl->start >= ns_start) { + u64 start = pl->start; + u64 len; + + if (pl_end <= ns_end) + len = pl->length; + else + len = ns_start + ns_size - pl->start; + + rc = __add_badblock_range(disk, start - ns_start, len); + if (rc) + return rc; + dev_info(&nvdimm_bus->dev, + "Found a poison range (0x%llx, 0x%llx)\n", + start, len); + continue; + } + /* Deal with overlap for poison starting before the namespace */ + if (pl->start < ns_start) { + u64 len; + + if (pl_end < ns_end) + len = pl->start + pl->length - ns_start; + else + len = ns_size; + + rc = __add_badblock_range(disk, 0, len); + if (rc) + return rc; + dev_info(&nvdimm_bus->dev, + "Found a poison range (0x%llx, 0x%llx)\n", + pl->start, len); + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison); + +static int __add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) +{ + struct nd_poison *pl; + + pl = kzalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + + pl->start = addr; + pl->length = length; + list_add_tail(&pl->list, &nvdimm_bus->poison_list); + + return 0; +} + +int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) +{ + struct nd_poison *pl; + + if (list_empty(&nvdimm_bus->poison_list)) + return __add_poison(nvdimm_bus, addr, length); + + /* + * There is a chance this is a duplicate, check for those first. + * This will be the common case as ARS_STATUS returns all known + * errors in the SPA space, and we can't query it per region + */ + list_for_each_entry(pl, &nvdimm_bus->poison_list, list) + if (pl->start == addr) { + /* If length has changed, update this list entry */ + if (pl->length != length) + pl->length = length; + return 0; + } + + /* + * If not a duplicate or a simple length update, add the entry as is, + * as any overlapping ranges will get resolved when the list is consumed + * and converted to badblocks + */ + return __add_poison(nvdimm_bus, addr, length); +} +EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); + +static void free_poison_list(struct list_head *poison_list) +{ + struct nd_poison *pl, *next; + + list_for_each_entry_safe(pl, next, poison_list, list) { + list_del(&pl->list); + kfree(pl); + } + list_del_init(poison_list); +} + static int child_unregister(struct device *dev, void *data) { /* @@ -385,6 +571,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) nd_synchronize(); device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); + free_poison_list(&nvdimm_bus->poison_list); nvdimm_bus_destroy_ndctl(nvdimm_bus); device_unregister(&nvdimm_bus->dev); diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 159aed5..d3b7ea7 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -30,6 +30,7 @@ struct nvdimm_bus { struct list_head list; struct device dev; int id, probe_active; + struct list_head poison_list; struct mutex reconfig_mutex; }; @@ -89,4 +90,6 @@ bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, ssize_t nd_namespace_store(struct device *dev, struct nd_namespace_common **_ndns, const char *buf, size_t len); +int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, + struct nd_namespace_common *ndns); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 417e521..ba91fcd 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -38,6 +38,12 @@ enum { #endif }; +struct nd_poison { + u64 start; + u64 length; + struct list_head list; +}; + struct nvdimm_drvdata { struct device *dev; int nsindex_size; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 8ee7989..5b95043 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -27,6 +27,7 @@ #include #include #include +#include "nd-core.h" #include "pfn.h" #include "nd.h" @@ -168,6 +169,7 @@ static int pmem_attach_disk(struct device *dev, { int nid = dev_to_node(dev); struct gendisk *disk; + int ret; pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid); if (!pmem->pmem_queue) @@ -196,6 +198,10 @@ static int pmem_attach_disk(struct device *dev, set_capacity(disk, (pmem->size - pmem->data_offset) / 512); pmem->pmem_disk = disk; + ret = nvdimm_namespace_add_poison(disk, pmem->data_offset, ndns); + if (ret) + return ret; + add_disk(disk); revalidate_disk(disk); diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 3f021dc..bed40df 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -116,6 +116,7 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( } +int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc, struct module *module); #define nvdimm_bus_register(parent, desc) \ -- cgit v0.10.2 From ad9a8bde2cb19f6876f964fc48acc8b6a2f325ff Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Jan 2016 12:03:41 -0800 Subject: libnvdimm, pmem: move definition of nvdimm_namespace_add_poison to nd.h nd-core.h is private to the libnvdimm core internals and should not be used by drivers. Signed-off-by: Dan Williams diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index d3b7ea7..29acdaa 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -90,6 +90,4 @@ bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, ssize_t nd_namespace_store(struct device *dev, struct nd_namespace_common **_ndns, const char *buf, size_t len); -int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, - struct nd_namespace_common *ndns); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index ba91fcd..198933d 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -268,6 +268,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); +int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, + struct nd_namespace_common *ndns); int nd_blk_region_init(struct nd_region *nd_region); void __nd_iostat_start(struct bio *bio, unsigned long *start); static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 5b95043..65b2056 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -27,7 +27,6 @@ #include #include #include -#include "nd-core.h" #include "pfn.h" #include "nd.h" -- cgit v0.10.2 From d3b407fb3f782bd915db64e266010ea30a2d381e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Jan 2016 12:19:22 -0800 Subject: badblocks: rename badblocks_free to badblocks_exit For symmetry with badblocks_init() make it clear that this path only destroys incremental allocations of a badblocks instance, and does not free the badblocks instance itself. Signed-off-by: Dan Williams diff --git a/block/badblocks.c b/block/badblocks.c index 96aeb91..fabf6b6 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -550,12 +550,12 @@ int badblocks_init(struct badblocks *bb, int enable) EXPORT_SYMBOL_GPL(badblocks_init); /** - * badblocks_free() - free the badblocks structure + * badblocks_exit() - free the badblocks structure * @bb: the badblocks structure that holds all badblock information */ -void badblocks_free(struct badblocks *bb) +void badblocks_exit(struct badblocks *bb) { kfree(bb->page); bb->page = NULL; } -EXPORT_SYMBOL_GPL(badblocks_free); +EXPORT_SYMBOL_GPL(badblocks_exit); diff --git a/block/genhd.c b/block/genhd.c index 88579cf..f463c67 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -671,7 +671,7 @@ void del_gendisk(struct gendisk *disk) blk_unregister_region(disk_devt(disk), disk->minors); if (disk->bb) { - badblocks_free(disk->bb); + badblocks_exit(disk->bb); kfree(disk->bb); } diff --git a/drivers/md/md.c b/drivers/md/md.c index 1e48aa9..96a9918 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -710,7 +710,7 @@ void md_rdev_clear(struct md_rdev *rdev) put_page(rdev->bb_page); rdev->bb_page = NULL; } - badblocks_free(&rdev->badblocks); + badblocks_exit(&rdev->badblocks); } EXPORT_SYMBOL_GPL(md_rdev_clear); diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h index 9293446..2d98c02 100644 --- a/include/linux/badblocks.h +++ b/include/linux/badblocks.h @@ -48,6 +48,6 @@ ssize_t badblocks_show(struct badblocks *bb, char *page, int unack); ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); int badblocks_init(struct badblocks *bb, int enable); -void badblocks_free(struct badblocks *bb); +void badblocks_exit(struct badblocks *bb); #endif -- cgit v0.10.2 From 20a308f09e0d29ce6f5a4114cc476a998d569bfb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Jan 2016 12:03:41 -0800 Subject: block: clarify badblocks lifetime The badblocks list attached to a gendisk is allocated by the driver which equates to the driver owning the lifetime of the object. Do not automatically free it in del_gendisk(). This is in preparation for expanding the use of badblocks in libnvdimm drivers and introducing devm_init_badblocks(). Signed-off-by: Dan Williams diff --git a/block/badblocks.c b/block/badblocks.c index fabf6b6..37e5c0a 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -555,6 +555,8 @@ EXPORT_SYMBOL_GPL(badblocks_init); */ void badblocks_exit(struct badblocks *bb) { + if (!bb) + return; kfree(bb->page); bb->page = NULL; } diff --git a/block/genhd.c b/block/genhd.c index f463c67..aa38cd0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -670,11 +670,6 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); - if (disk->bb) { - badblocks_exit(disk->bb); - kfree(disk->bb); - } - part_stat_set_all(&disk->part0, 0); disk->part0.stamp = 0; -- cgit v0.10.2 From 16263ff6c72eb4cc00aa287230144dda12ccad12 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 4 Jan 2016 23:50:23 -0800 Subject: block, badblocks: introduce devm_init_badblocks Provide a devres interface for initializing a badblocks instance. The pmem driver has several scenarios where it will be beneficial to have this structure automatically freed when the device is disabled / fails probe. Signed-off-by: Dan Williams diff --git a/block/badblocks.c b/block/badblocks.c index 37e5c0a..7be53cb 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -522,24 +523,20 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, } EXPORT_SYMBOL_GPL(badblocks_store); -/** - * badblocks_init() - initialize the badblocks structure - * @bb: the badblocks structure that holds all badblock information - * @enable: weather to enable badblocks accounting - * - * Return: - * 0: success - * -ve errno: on error - */ -int badblocks_init(struct badblocks *bb, int enable) +static int __badblocks_init(struct device *dev, struct badblocks *bb, + int enable) { + bb->dev = dev; bb->count = 0; if (enable) bb->shift = 0; else bb->shift = -1; - bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (bb->page == (u64 *)0) { + if (dev) + bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL); + else + bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!bb->page) { bb->shift = -1; return -ENOMEM; } @@ -547,8 +544,30 @@ int badblocks_init(struct badblocks *bb, int enable) return 0; } + +/** + * badblocks_init() - initialize the badblocks structure + * @bb: the badblocks structure that holds all badblock information + * @enable: weather to enable badblocks accounting + * + * Return: + * 0: success + * -ve errno: on error + */ +int badblocks_init(struct badblocks *bb, int enable) +{ + return __badblocks_init(NULL, bb, enable); +} EXPORT_SYMBOL_GPL(badblocks_init); +int devm_init_badblocks(struct device *dev, struct badblocks *bb) +{ + if (!bb) + return -EINVAL; + return __badblocks_init(dev, bb, 1); +} +EXPORT_SYMBOL_GPL(devm_init_badblocks); + /** * badblocks_exit() - free the badblocks structure * @bb: the badblocks structure that holds all badblock information @@ -557,7 +576,10 @@ void badblocks_exit(struct badblocks *bb) { if (!bb) return; - kfree(bb->page); + if (bb->dev) + devm_kfree(bb->dev, bb->page); + else + kfree(bb->page); bb->page = NULL; } EXPORT_SYMBOL_GPL(badblocks_exit); diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h index 2d98c02..c3bdf8c 100644 --- a/include/linux/badblocks.h +++ b/include/linux/badblocks.h @@ -2,6 +2,7 @@ #define _LINUX_BADBLOCKS_H #include +#include #include #include #include @@ -23,6 +24,7 @@ #define MAX_BADBLOCKS (PAGE_SIZE/8) struct badblocks { + struct device *dev; /* set by devm_init_badblocks */ int count; /* count of bad blocks */ int unacked_exist; /* there probably are unacknowledged * bad blocks. This is only cleared @@ -49,5 +51,15 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); int badblocks_init(struct badblocks *bb, int enable); void badblocks_exit(struct badblocks *bb); - +struct device; +int devm_init_badblocks(struct device *dev, struct badblocks *bb); +static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb) +{ + if (bb->dev != dev) { + dev_WARN_ONCE(dev, 1, "%s: badblocks instance not associated\n", + __func__); + return; + } + badblocks_exit(bb); +} #endif -- cgit v0.10.2 From 87ba05dff3510f9e058b35d3c3fa222b6f406ecc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 9 Jan 2016 07:48:43 -0800 Subject: libnvdimm: don't fail init for full badblocks list If the badblocks list runs out of space it simply means that software is unable to intercept all errors. This is no different than the latent discovery of new badblocks case and should not be an initialization failure condition. Signed-off-by: Dan Williams diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 21003b7..e419d66 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -360,6 +360,18 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, } EXPORT_SYMBOL_GPL(__nvdimm_bus_register); +static void set_badblock(struct gendisk *disk, sector_t s, int num) +{ + struct device *dev = disk->driverfs_dev; + + dev_dbg(dev, "Found a poison range (0x%llx, 0x%llx)\n", + (u64) s * 512, (u64) num * 512); + /* this isn't an error as the hardware will still throw an exception */ + if (disk_set_badblocks(disk, s, num)) + dev_info_once(dev, "%s: failed for sector %llx\n", + __func__, (u64) s); +} + /** * __add_badblock_range() - Convert a physical address range to bad sectors * @disk: the disk associated with the namespace @@ -396,15 +408,14 @@ static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len) while (remaining) { int done = min_t(u64, remaining, INT_MAX); - rc = disk_set_badblocks(disk, s, done); - if (rc) - return rc; + set_badblock(disk, s, done); remaining -= done; s += done; } - return 0; } else - return disk_set_badblocks(disk, start_sector, num_sectors); + set_badblock(disk, start_sector, num_sectors); + + return 0; } /** @@ -463,9 +474,6 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, rc = __add_badblock_range(disk, start - ns_start, len); if (rc) return rc; - dev_info(&nvdimm_bus->dev, - "Found a poison range (0x%llx, 0x%llx)\n", - start, len); continue; } /* Deal with overlap for poison starting before the namespace */ @@ -480,9 +488,6 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, rc = __add_badblock_range(disk, 0, len); if (rc) return rc; - dev_info(&nvdimm_bus->dev, - "Found a poison range (0x%llx, 0x%llx)\n", - pl->start, len); } } -- cgit v0.10.2 From b95f5f4391fad65f1819c2404080b05ca95bdd92 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 4 Jan 2016 23:50:23 -0800 Subject: libnvdimm: convert to statically allocated badblocks If a device will ever have badblocks it should always have a badblocks instance available. So, similar to md, embed a badblocks instance in pmem_device. This reduces pointer chasing in the i/o fast path, and simplifies the init path. Reported-by: Vishal Verma Signed-off-by: Dan Williams diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index e419d66..2e2832b 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -11,6 +11,7 @@ * General Public License for more details. */ #include +#include #include #include #include @@ -360,21 +361,19 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, } EXPORT_SYMBOL_GPL(__nvdimm_bus_register); -static void set_badblock(struct gendisk *disk, sector_t s, int num) +static void set_badblock(struct badblocks *bb, sector_t s, int num) { - struct device *dev = disk->driverfs_dev; - - dev_dbg(dev, "Found a poison range (0x%llx, 0x%llx)\n", + dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n", (u64) s * 512, (u64) num * 512); /* this isn't an error as the hardware will still throw an exception */ - if (disk_set_badblocks(disk, s, num)) - dev_info_once(dev, "%s: failed for sector %llx\n", + if (badblocks_set(bb, s, num, 1)) + dev_info_once(bb->dev, "%s: failed for sector %llx\n", __func__, (u64) s); } /** * __add_badblock_range() - Convert a physical address range to bad sectors - * @disk: the disk associated with the namespace + * @bb: badblocks instance to populate * @ns_offset: namespace offset where the error range begins (in bytes) * @len: number of bytes of poison to be added * @@ -382,25 +381,18 @@ static void set_badblock(struct gendisk *disk, sector_t s, int num) * the bounds of physical addresses for this namespace, i.e. lies in the * interval [ns_start, ns_start + ns_size) */ -static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len) +static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) { - unsigned int sector_size = queue_logical_block_size(disk->queue); + const unsigned int sector_size = 512; sector_t start_sector; u64 num_sectors; u32 rem; - int rc; start_sector = div_u64(ns_offset, sector_size); num_sectors = div_u64_rem(len, sector_size, &rem); if (rem) num_sectors++; - if (!disk->bb) { - rc = disk_alloc_badblocks(disk); - if (rc) - return rc; - } - if (unlikely(num_sectors > (u64)INT_MAX)) { u64 remaining = num_sectors; sector_t s = start_sector; @@ -408,33 +400,27 @@ static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len) while (remaining) { int done = min_t(u64, remaining, INT_MAX); - set_badblock(disk, s, done); + set_badblock(bb, s, done); remaining -= done; s += done; } } else - set_badblock(disk, start_sector, num_sectors); - - return 0; + set_badblock(bb, start_sector, num_sectors); } /** * nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks - * @disk: the gendisk associated with the namespace where badblocks - * will be stored - * @offset: offset at the start of the namespace before 'sector 0' * @ndns: the namespace containing poison ranges + * @bb: badblocks instance to populate + * @offset: offset at the start of the namespace before 'sector 0' * * The poison list generated during NFIT initialization may contain multiple, * possibly overlapping ranges in the SPA (System Physical Address) space. * Compare each of these ranges to the namespace currently being initialized, * and add badblocks to the gendisk for all matching sub-ranges - * - * Return: - * 0 - Success */ -int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, - struct nd_namespace_common *ndns) +void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns, + struct badblocks *bb, resource_size_t offset) { struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); struct nd_region *nd_region = to_nd_region(ndns->dev.parent); @@ -442,7 +428,6 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, struct list_head *poison_list; u64 ns_start, ns_end, ns_size; struct nd_poison *pl; - int rc; ns_size = nvdimm_namespace_capacity(ndns) - offset; ns_start = nsio->res.start + offset; @@ -451,7 +436,7 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent); poison_list = &nvdimm_bus->poison_list; if (list_empty(poison_list)) - return 0; + return; list_for_each_entry(pl, poison_list, list) { u64 pl_end = pl->start + pl->length - 1; @@ -470,10 +455,7 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, len = pl->length; else len = ns_start + ns_size - pl->start; - - rc = __add_badblock_range(disk, start - ns_start, len); - if (rc) - return rc; + __add_badblock_range(bb, start - ns_start, len); continue; } /* Deal with overlap for poison starting before the namespace */ @@ -484,14 +466,9 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, len = pl->start + pl->length - ns_start; else len = ns_size; - - rc = __add_badblock_range(disk, 0, len); - if (rc) - return rc; + __add_badblock_range(bb, 0, len); } } - - return 0; } EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 198933d..288d96e 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -268,8 +268,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); -int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, - struct nd_namespace_common *ndns); +void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns, + struct badblocks *bb, resource_size_t offset); int nd_blk_region_init(struct nd_region *nd_region); void __nd_iostat_start(struct bio *bio, unsigned long *start); static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 65b2056..2b1f300 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ struct pmem_device { phys_addr_t data_offset; void __pmem *virt_addr; size_t size; + struct badblocks bb; }; static int pmem_major; @@ -168,7 +170,6 @@ static int pmem_attach_disk(struct device *dev, { int nid = dev_to_node(dev); struct gendisk *disk; - int ret; pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid); if (!pmem->pmem_queue) @@ -196,10 +197,9 @@ static int pmem_attach_disk(struct device *dev, disk->driverfs_dev = dev; set_capacity(disk, (pmem->size - pmem->data_offset) / 512); pmem->pmem_disk = disk; - - ret = nvdimm_namespace_add_poison(disk, pmem->data_offset, ndns); - if (ret) - return ret; + if (devm_init_badblocks(dev, &pmem->bb)) + return -ENOMEM; + nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset); add_disk(disk); revalidate_disk(disk); -- cgit v0.10.2 From e10624f8c09710b3b0740ea3847627ea02f55c39 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Jan 2016 12:03:41 -0800 Subject: pmem: fail io-requests to known bad blocks Check the sectors specified in a read bio to see if they hit a known bad block, and return an error code pmem_do_bvec(). Note that the ->rw_page() is not in a position to return errors. For now, copy the same layering violation present in zram_rw_page() to avoid crashes of the form: kernel BUG at mm/filemap.c:822! [..] Call Trace: [] page_endio+0x1e/0x60 [] mpage_end_io+0x39/0x60 [] bio_endio+0x3f/0x60 [] pmem_make_request+0x111/0x230 [nd_pmem] ...i.e. unlock a page that was already unlocked via pmem_rw_page() => page_endio(). Reported-by: Vishal Verma Signed-off-by: Dan Williams diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 2b1f300..d00c659 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -47,7 +47,20 @@ struct pmem_device { static int pmem_major; -static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, +static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len) +{ + if (bb->count) { + sector_t first_bad; + int num_bad; + + return !!badblocks_check(bb, sector, len / 512, &first_bad, + &num_bad); + } + + return false; +} + +static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, int rw, sector_t sector) { @@ -56,6 +69,8 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, void __pmem *pmem_addr = pmem->virt_addr + pmem_off; if (rw == READ) { + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) + return -EIO; memcpy_from_pmem(mem + off, pmem_addr, len); flush_dcache_page(page); } else { @@ -64,10 +79,12 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, } kunmap_atomic(mem); + return 0; } static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) { + int rc = 0; bool do_acct; unsigned long start; struct bio_vec bvec; @@ -76,9 +93,15 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) struct pmem_device *pmem = bdev->bd_disk->private_data; do_acct = nd_iostat_start(bio, &start); - bio_for_each_segment(bvec, bio, iter) - pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset, - bio_data_dir(bio), iter.bi_sector); + bio_for_each_segment(bvec, bio, iter) { + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, + bvec.bv_offset, bio_data_dir(bio), + iter.bi_sector); + if (rc) { + bio->bi_error = rc; + break; + } + } if (do_acct) nd_iostat_end(bio, start); @@ -93,13 +116,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, struct page *page, int rw) { struct pmem_device *pmem = bdev->bd_disk->private_data; + int rc; - pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); + rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); if (rw & WRITE) wmb_pmem(); - page_endio(page, rw & WRITE, 0); - return 0; + /* + * The ->rw_page interface is subtle and tricky. The core + * retries on any error, so we can only invoke page_endio() in + * the successful completion case. Otherwise, we'll see crashes + * caused by double completion. + */ + if (rc == 0) + page_endio(page, rw & WRITE, 0); + + return rc; } static long pmem_direct_access(struct block_device *bdev, sector_t sector, -- cgit v0.10.2 From 57f7f317abdd07954cb116280c88d18378afb33e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Jan 2016 12:03:42 -0800 Subject: pmem, dax: disable dax in the presence of bad blocks Longer term teach dax to punch "error" holes in mapping requests and deliver SIGBUS to applications that consume a bad pmem page. For now, simply disable the dax performance optimization in the presence of known errors. Signed-off-by: Dan Williams diff --git a/block/ioctl.c b/block/ioctl.c index 7a964d8..2c84683 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -422,6 +423,15 @@ bool blkdev_dax_capable(struct block_device *bdev) || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) return false; + /* + * If the device has known bad blocks, force all I/O through the + * driver / page cache. + * + * TODO: support finer grained dax error handling + */ + if (disk->bb && disk->bb->count) + return false; + return true; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index d00c659..6a1832b 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -233,6 +233,7 @@ static int pmem_attach_disk(struct device *dev, return -ENOMEM; nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset); + disk->bb = &pmem->bb; add_disk(disk); revalidate_disk(disk); -- cgit v0.10.2 From 710d69cc99507803ed91b4ec7368fbd66d59f014 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 4 Jan 2016 23:31:24 -0800 Subject: libnvdimm, pmem: nvdimm_read_bytes() badblocks support Support badblock checking in all the pmem read paths that do not go through the block layer. This protects info block reads (btt or pfn) as well as data reads to a pmem namespace via a btt instance. Signed-off-by: Dan Williams diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 6a1832b..a88762d 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -229,6 +229,7 @@ static int pmem_attach_disk(struct device *dev, disk->driverfs_dev = dev; set_capacity(disk, (pmem->size - pmem->data_offset) / 512); pmem->pmem_disk = disk; + devm_exit_badblocks(dev, &pmem->bb); if (devm_init_badblocks(dev, &pmem->bb)) return -ENOMEM; nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset); @@ -250,9 +251,13 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns, return -EFAULT; } - if (rw == READ) + if (rw == READ) { + unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); + + if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align))) + return -EIO; memcpy_from_pmem(buf, pmem->virt_addr + offset, size); - else { + } else { memcpy_to_pmem(pmem->virt_addr + offset, buf, size); wmb_pmem(); } @@ -427,6 +432,9 @@ static int nd_pmem_probe(struct device *dev) pmem->ndns = ndns; dev_set_drvdata(dev, pmem); ndns->rw_bytes = pmem_rw_bytes; + if (devm_init_badblocks(dev, &pmem->bb)) + return -ENOMEM; + nvdimm_namespace_add_poison(ndns, &pmem->bb, 0); if (is_nd_btt(dev)) return nvdimm_namespace_attach_btt(ndns); -- cgit v0.10.2 From 55f5560d8c18fe33fc169f8d244a9247dcac7612 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 5 Jan 2016 00:28:18 -0800 Subject: block: kill disk_{check|set|clear|alloc}_badblocks These actions are completely managed by a block driver or can use the badblocks api directly. Signed-off-by: Dan Williams diff --git a/block/genhd.c b/block/genhd.c index aa38cd0..5aaeb2a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -506,16 +506,6 @@ static int exact_lock(dev_t devt, void *data) return 0; } -int disk_alloc_badblocks(struct gendisk *disk) -{ - disk->bb = kzalloc(sizeof(*(disk->bb)), GFP_KERNEL); - if (!disk->bb) - return -ENOMEM; - - return badblocks_init(disk->bb, 1); -} -EXPORT_SYMBOL(disk_alloc_badblocks); - static void register_disk(struct gendisk *disk) { struct device *ddev = disk_to_dev(disk); @@ -682,38 +672,6 @@ void del_gendisk(struct gendisk *disk) } EXPORT_SYMBOL(del_gendisk); -/* - * The gendisk usage of badblocks does not track acknowledgements for - * badblocks. We always assume they are acknowledged. - */ -int disk_check_badblocks(struct gendisk *disk, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) -{ - if (!disk->bb) - return 0; - - return badblocks_check(disk->bb, s, sectors, first_bad, bad_sectors); -} -EXPORT_SYMBOL(disk_check_badblocks); - -int disk_set_badblocks(struct gendisk *disk, sector_t s, int sectors) -{ - if (!disk->bb) - return 0; - - return badblocks_set(disk->bb, s, sectors, 1); -} -EXPORT_SYMBOL(disk_set_badblocks); - -int disk_clear_badblocks(struct gendisk *disk, sector_t s, int sectors) -{ - if (!disk->bb) - return 0; - - return badblocks_clear(disk->bb, s, sectors); -} -EXPORT_SYMBOL(disk_clear_badblocks); - /* sysfs access to bad-blocks list. */ static ssize_t disk_badblocks_show(struct device *dev, struct device_attribute *attr, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 0bbec68..5c70676 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -435,11 +435,6 @@ extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); -int disk_alloc_badblocks(struct gendisk *disk); -extern int disk_check_badblocks(struct gendisk *disk, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors); -extern int disk_set_badblocks(struct gendisk *disk, sector_t s, int sectors); -extern int disk_clear_badblocks(struct gendisk *disk, sector_t s, int sectors); extern void set_device_ro(struct block_device *bdev, int flag); extern void set_disk_ro(struct gendisk *disk, int flag); -- cgit v0.10.2